## 1. Setup and Imports

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import warnings
warnings.filterwarnings('ignore')

## 2. Load Model and Tokenizer

In [None]:
# Hub identifier of the checkpoint; shared by the tokenizer and the model.
model_id = "ModelSpace/GemmaX2-28-9B-v0.1"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Generation passes pad_token_id explicitly, so guarantee that one exists by
# falling back to the end-of-sequence token when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model on CPU...")
# Loading options kept together so the CPU-specific choices are easy to spot
# (and to change) in one place.
cpu_load_options = {
    "torch_dtype": torch.float32,
    "device_map": "cpu",
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **cpu_load_options)

print("Model loaded successfully!")

## 3. Helper Function for Text Generation

In [None]:
def generate_text(prompt, max_tokens=200, temperature=0.8, top_p=0.95):
    """
    Generate a reply to ``prompt`` using the module-level model and tokenizer.

    The prompt is wrapped in Gemma-style chat turn markers, passed through
    ``model.generate`` and only the newly generated continuation is returned.

    Args:
        prompt: Input text prompt
        max_tokens: Maximum number of new tokens to generate
        temperature: Sampling temperature (higher = more random). A value
            of 0 or below switches to deterministic greedy decoding, since
            sampling requires a strictly positive temperature.
        top_p: Nucleus sampling parameter (only used when sampling)

    Returns:
        The generated text with the prompt removed and surrounding
        whitespace stripped.
    """
    # Format prompt for Gemma chat model
    # NOTE(review): this assumes the checkpoint was tuned on Gemma's chat turn
    # markers -- confirm the expected prompt format against the model card.
    formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

    # Put the inputs on the model's device so the function keeps working if
    # the model is later loaded somewhere other than the CPU.
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "repetition_penalty": 1.15,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature <= 0:
        # Sampling with a non-positive temperature is rejected by generate();
        # treat it as a request for deterministic output instead.
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )

    outputs = model.generate(**inputs, **generation_kwargs)

    # For a causal LM the returned sequence starts with the prompt tokens.
    # Slice them off by length rather than searching the decoded text for the
    # word "model", which would truncate any answer that mentions that word.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_length:]

    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

## 4. Example Usage

In [None]:
# Example 1: a general knowledge question with a 150-token budget.
prompt = "Explain reinforcement learning in simple terms."
response = generate_text(prompt=prompt, max_tokens=150)
# A single print call emits the prompt line, a blank line, then the response.
print(f"Prompt: {prompt}", f"\nResponse: {response}", sep="\n")

In [None]:
# Example 2: a creative writing task; a higher temperature gives more varied
# output and the larger token budget leaves room for a short story.
prompt = "Write a short story about a robot learning to make decisions."
response = generate_text(prompt=prompt, max_tokens=300, temperature=0.9)
# A single print call emits the prompt line, a blank line, then the response.
print(f"Prompt: {prompt}", f"\nResponse: {response}", sep="\n")

In [None]:
# Example 3: a code generation request; a lower temperature keeps the output
# less random than the creative example.
prompt = "Write a Python function to calculate the Bellman equation for Q-learning."
response = generate_text(prompt=prompt, max_tokens=200, temperature=0.7)
# A single print call emits the prompt line, a blank line, then the response.
print(f"Prompt: {prompt}", f"\nResponse: {response}", sep="\n")

## 5. Batch Generation for Multiple Prompts

In [None]:
# Questions to answer; they are processed one at a time, in order.
prompts = [
    "What is the difference between Q-learning and SARSA?",
    "Explain policy gradient methods.",
    "What are the main challenges in reinforcement learning?",
]

# Banner rule drawn above and below each prompt heading.
divider = "=" * 60

for i, prompt in enumerate(prompts, start=1):
    # Blank line, rule, numbered prompt, rule -- printed in one call.
    print(f"\n{divider}", f"Prompt {i}: {prompt}", divider, sep="\n")
    response = generate_text(prompt, max_tokens=150)
    print(response)

## 6. Integration with RL Workflow

You can use this model for:
- Generating synthetic training data
- Creating reward model descriptions
- Generating explanations for RL agent decisions
- Creating natural language interfaces for your RL system

In [None]:
# Example: Generate synthetic RL scenarios
def generate_rl_scenario(task_type="navigation"):
    """
    Ask the model to describe a training scenario for an RL agent.

    Args:
        task_type: Kind of task the scenario should cover; it is inserted
            verbatim into the prompt (e.g. "navigation").

    Returns:
        The text returned by ``generate_text`` for the scenario prompt.
    """
    # Build the prompt from adjacent string literals instead of a
    # triple-quoted block, so no line break or source-code indentation ends up
    # inside the text that is sent to the model.
    prompt = (
        f"Describe a {task_type} scenario for training a reinforcement learning agent. "
        "Include the environment, states, actions, and rewards."
    )

    return generate_text(prompt, max_tokens=250, temperature=0.8)

# Request a navigation-themed scenario and print it under a heading.
scenario = generate_rl_scenario(task_type="navigation")
print("Generated RL Scenario:", scenario, sep="\n")

## Notes

- **Performance**: Currently running on CPU. Generation may be slow for long sequences.
- **GPU Support**: Once PyTorch adds support for RTX 5090 (sm_120), you can change `device_map="cpu"` to `device_map="auto"` to use GPU acceleration.
- **Model Behavior**: This is a multilingual model and may occasionally generate responses in other languages.
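- **Memory**: The model requires significant RAM when running on CPU: roughly 36 GB for the weights alone (about 9 billion parameters × 4 bytes each in float32), plus additional working memory during generation.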