# GRPO Training for Bitcoin Price Prediction

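This notebook fine-tunes a LoRA-adapted language model for Bitcoin price prediction with a Group Relative Policy Optimization (GRPO)-style loop: for every prompt it samples a group of responses, scores each one with a heuristic reward, and trains the model to assign higher likelihood to the better-scored responses in the group.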

**Dataset**: `bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news`

**Training Method**: Group Relative Policy Optimization (GRPO)
- Preference learning through relative comparisons
- Custom reward system for Bitcoin prediction quality
- Multi-response generation and ranking

## Install Libraries

In [None]:
# !pip install -U unsloth
# !pip install trl
# !pip install accelerate
# !pip install datasets
# !pip install transformers

## Imports

In [None]:
from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset
import torch, random, os
from typing import Dict, List, Any
import json
import numpy as np
import pandas as pd
from datetime import datetime
import torch.nn.functional as F
from torch.optim import AdamW
from tqdm import tqdm

# Seed the torch, `random` and numpy global RNGs with the same value so their
# random draws start from the same state every time the notebook is run.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

## Configuration

In [None]:
# Model and training configuration
# MODEL_CONFIG is read by the model-loading cell (from_pretrained / get_peft_model).
MODEL_CONFIG = {
    "model_path": "./Qwen3-8B",  # Base model path
    # "model_path": "qwen_bitcoin_sft_enhanced/lora_adapter",  # Use this if loading from SFT checkpoint
    "max_seq_length": 4096,
    "dtype": torch.float16,  # a torch.dtype object, not a JSON-native value
    "load_in_4bit": True,
    "lora_r": 32,
    "lora_alpha": 32,
    "lora_dropout": 0.0,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
}

# GRPO_CONFIG is handed to GRPOTrainer and read directly by the training loop.
GRPO_CONFIG = {
    "output_dir": "qwen_bitcoin_grpo_only_enhanced",  # root folder for checkpoints, logs and the final adapter
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,  # number of prompts per batch
    "gradient_accumulation_steps": 4,  # NOTE(review): no visible cell reads this key - confirm it is applied
    "learning_rate": 1e-5,
    "logging_steps": 5,  # log every N optimizer steps
    "save_steps": 100,  # checkpoint every N optimizer steps
    "warmup_ratio": 0.1,  # NOTE(review): no visible cell reads this key - confirm it is applied
    "group_size": 4,  # Number of responses to compare per group
    "temperature": 0.7,  # For response generation
}

# Dataset identifier passed to load_dataset().
DATASET_NAME = "tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news"

## Load Model and Tokenizer

In [None]:
# Load the base model and its tokenizer using the settings from MODEL_CONFIG.
model, tokenizer = FastLanguageModel.from_pretrained(
    MODEL_CONFIG["model_path"],
    max_seq_length=MODEL_CONFIG["max_seq_length"],
    dtype=MODEL_CONFIG["dtype"],
    load_in_4bit=MODEL_CONFIG["load_in_4bit"],
)

# Prepare model for training
# NOTE(review): for_training() is called before the adapters are attached -
# confirm this order is what Unsloth expects.
FastLanguageModel.for_training(model)
# Wrap the model with LoRA adapters on the configured projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=MODEL_CONFIG["lora_r"],
    target_modules=MODEL_CONFIG["target_modules"],
    lora_alpha=MODEL_CONFIG["lora_alpha"],
    lora_dropout=MODEL_CONFIG["lora_dropout"],
    use_rslora=True,
)

# Report total vs. trainable parameter counts as a sanity check on the adapter setup.
print(f"Model loaded: {MODEL_CONFIG['model_path']}")
print(f"Total parameters: {model.num_parameters():,}")
print(f"Trainable parameters: {model.num_parameters(only_trainable=True):,}")

## Special Tokens Setup

In [None]:
# Structural tags used to mark sections of a response; each one is registered
# with the tokenizer as an additional special token.
RESPONSE_TAG = "<|response|>"
ANALYSIS_TAG = "<|analysis|>"
FORECAST_TAG = "<|forecast|>"
THINKING_TAG = "<|thinking|>"
SPECIAL_TOKENS = [RESPONSE_TAG, ANALYSIS_TAG, FORECAST_TAG, THINKING_TAG]

num_added = tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
# The embedding matrix only needs to grow when the vocabulary actually changed.
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))
    print(f"Added {num_added} special tokens")

response_token_id = tokenizer.convert_tokens_to_ids(RESPONSE_TAG)

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Truncate from the left and pad on the right.
tokenizer.truncation_side = "left"
tokenizer.padding_side = "right"

print(f"Special tokens: {SPECIAL_TOKENS}")
print(f"Response token ID: {response_token_id}")

## Load and Prepare Dataset

In [None]:
# Fetch the dataset and report its overall structure.
raw_dataset = load_dataset(DATASET_NAME)
print(f"Loaded dataset: {DATASET_NAME}")
print(f"Dataset structure: {raw_dataset}")

# All training below works on the "train" split.
train_data = raw_dataset["train"]
print(f"Total training samples: {len(train_data)}")

# Preview the first record, clipping every field to its first 100 characters.
print("\n=== Sample Data ===")
sample = train_data[0]
for field_name in sample:
    preview = str(sample[field_name])[:100]
    print(f"{field_name}: {preview}...")

## Data Processing for GRPO

In [None]:
def format_chat_grpo(ex):
    """Turn one raw dataset record into a GRPO prompt plus its reference answer.

    Args:
        ex: Mapping with optional "instruction", "input" and "output" entries.
            Missing or None entries are treated as empty strings.

    Returns:
        dict with two string fields:
            "query": the system + user messages rendered through the tokenizer's
                chat template, ending with the generation prompt.
            "ground_truth": the record's "output" (never None).
    """
    instruction = ex.get("instruction", "") or ""
    user_input = ex.get("input", "") or ""

    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_input},
    ]
    return {
        "query": tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
        # Normalise a None "output" the same way as the other fields: the reward
        # computation calls .lower() on this value and would crash on None.
        "ground_truth": ex.get("output", "") or "",
    }

# Format data for GRPO: apply format_chat_grpo to every record and drop all of the
# original columns, so only the fields returned by format_chat_grpo remain.
grpo_formatted = train_data.map(format_chat_grpo, remove_columns=train_data.column_names)
print(f"GRPO data prepared: {len(grpo_formatted)} samples")

# Create batches for GRPO training
def create_grpo_batches(dataset, batch_size=2):
    """Split ``dataset`` into consecutive (queries, ground_truths) batches.

    Args:
        dataset: Sized, sliceable collection of examples with "query" and
            "ground_truth" fields. Slicing may yield either a dict of columns
            (what a Hugging Face ``Dataset`` returns) or a sequence of
            per-example mappings (e.g. a plain list of dicts).
        batch_size: Maximum number of examples per batch; the final batch may
            be smaller.

    Returns:
        List of ``(queries, ground_truths)`` tuples, each element a list of str.
    """
    batches = []
    for start in range(0, len(dataset), batch_size):
        chunk = dataset[start:start + batch_size]
        if isinstance(chunk, dict):
            # Columnar slice: iterating it would walk the column *names*, so
            # read the two columns directly instead.
            queries = list(chunk["query"])
            ground_truths = list(chunk["ground_truth"])
        else:
            # Row-oriented slice: one mapping per example.
            queries = [item["query"] for item in chunk]
            ground_truths = [item["ground_truth"] for item in chunk]
        batches.append((queries, ground_truths))
    return batches

# Chunk the formatted prompts using the configured per-device batch size.
grpo_batches = create_grpo_batches(
    grpo_formatted,
    batch_size=GRPO_CONFIG["per_device_train_batch_size"],
)
print(f"Created {len(grpo_batches)} GRPO batches")

# Preview the first prompt (first 300 characters) and its reference answer
# (first 100 characters).
print("\n=== Sample GRPO Query ===")
first_example = grpo_formatted[0]
print(first_example["query"][:300] + "...")
print(f"\nGround truth: {first_example['ground_truth'][:100]}...")

## GRPO Trainer Implementation

In [None]:
class GRPOTrainer:
    """Group-based preference trainer for the Bitcoin prediction model.

    For every prompt the trainer samples a group of responses, scores each with a
    heuristic reward, and computes a pairwise log-sigmoid ranking loss that pushes
    the length-normalised log-likelihood of higher-reward responses above that of
    lower-reward ones from the same group.

    Attributes:
        model: Causal language model exposing ``generate`` and a forward pass
            whose output has ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        config: Dict providing at least "group_size" and "temperature".
        device: Device the model lives on; all tensors are moved there.
    """

    # Prompt token budget. Generation and loss computation must truncate the
    # prompt identically, otherwise the prompt/response boundary is misplaced.
    MAX_PROMPT_TOKENS = 2048

    def __init__(self, model, tokenizer, config):
        self.model = model
        self.tokenizer = tokenizer
        self.config = config
        self.device = model.device

    def _decode_response(self, token_ids):
        """Decode generated ids to text, keeping the structural tags.

        Control tokens (EOS, padding, ...) are removed, but the tags listed in
        SPECIAL_TOKENS are kept. Decoding with ``skip_special_tokens=True`` would
        strip them as well, so the forecast-tag reward could never fire and the
        tags would be missing from the text used for the loss.
        """
        keep_ids = set(self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS))
        drop_ids = set(self.tokenizer.all_special_ids) - keep_ids
        kept = [tid for tid in token_ids.tolist() if tid not in drop_ids]
        return self.tokenizer.decode(kept, skip_special_tokens=False).strip()

    def generate_responses(self, queries, num_responses=4, temperature=0.7, max_length=512):
        """Sample ``num_responses`` completions for every query.

        Args:
            queries: List of prompt strings.
            num_responses: Group size, i.e. completions sampled per prompt.
            temperature: Sampling temperature.
            max_length: Maximum number of newly generated tokens per completion.

        Returns:
            List (one entry per query) of lists of stripped response strings.
        """
        self.model.eval()
        responses = []

        with torch.no_grad():
            for query in queries:
                inputs = self.tokenizer(
                    query, return_tensors="pt", truncation=True, max_length=self.MAX_PROMPT_TOKENS
                )
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                prompt_len = inputs["input_ids"].shape[1]

                query_responses = []
                for _ in range(num_responses):
                    outputs = self.model.generate(
                        **inputs,
                        # Same budget as max_length=prompt_len + max_length.
                        max_new_tokens=max_length,
                        temperature=temperature,
                        do_sample=True,
                        pad_token_id=self.tokenizer.eos_token_id,
                        eos_token_id=self.tokenizer.eos_token_id,
                    )
                    # Everything after the prompt is the sampled completion.
                    query_responses.append(self._decode_response(outputs[0][prompt_len:]))

                responses.append(query_responses)

        return responses

    def compute_rewards(self, queries, responses, ground_truths):
        """Score every response against the ground truth of its query.

        ``queries`` is accepted for interface symmetry but not used by the reward.

        Returns:
            List (one entry per query) of lists of float rewards, aligned with
            ``responses``.
        """
        return [
            [self._calculate_response_reward(response, gt) for response in query_responses]
            for query_responses, gt in zip(responses, ground_truths)
        ]

    def _calculate_response_reward(self, response, ground_truth):
        """Heuristic reward in [0, 1.2] for a single response.

        Components: length within 50-500 characters (+0.1), contains a digit
        (+0.2), contains FORECAST_TAG (+0.3), domain keywords (+0.05 each, capped
        at 0.2), and distinct words shared with the ground truth (+0.02 each,
        capped at 0.4).
        """
        ground_truth = ground_truth or ""  # tolerate a missing reference answer
        response_lower = response.lower()
        reward = 0.0

        # Length penalty/reward
        if 50 <= len(response) <= 500:
            reward += 0.1

        # Format check (contains numbers/predictions)
        if any(char.isdigit() for char in response):
            reward += 0.2

        # Contains forecast tag
        if FORECAST_TAG in response:
            reward += 0.3

        # Contains analysis keywords
        bitcoin_keywords = ['bitcoin', 'price', 'forecast', 'prediction', 'market', 'trend']
        keyword_count = sum(1 for keyword in bitcoin_keywords if keyword in response_lower)
        reward += min(0.2, keyword_count * 0.05)

        # Simple text similarity with ground truth
        common_words = set(response_lower.split()) & set(ground_truth.lower().split())
        if common_words:
            reward += min(0.4, len(common_words) * 0.02)

        return reward

    def _response_logprob(self, prompt_ids, response):
        """Mean per-token log-probability of ``response`` given the prompt.

        The response is tokenised separately and appended to the already
        truncated prompt ids. This keeps the prompt/response boundary exact:
        re-tokenising ``query + " " + response`` as one string can merge tokens
        across the boundary and truncates differently from generation.

        Returns:
            0-dim tensor; a gradient-free ``0.0`` when the response has no tokens.
        """
        resp_id_list = self.tokenizer(response, add_special_tokens=False)["input_ids"] if response else []
        if len(resp_id_list) == 0:
            return torch.tensor(0.0, device=self.device)

        resp_ids = torch.tensor([resp_id_list], dtype=torch.long, device=self.device)
        input_ids = torch.cat([prompt_ids, resp_ids], dim=1)
        logits = self.model(input_ids=input_ids, attention_mask=torch.ones_like(input_ids)).logits

        # Logits at position t predict token t + 1, so the first scorable token
        # is at index max(prompt_len, 1) (index 0 has no preceding context).
        start = max(prompt_ids.shape[1], 1)
        targets = input_ids[0, start:]
        if targets.numel() == 0:
            return torch.tensor(0.0, device=self.device)

        # Upcast before log-softmax: half-precision logits lose accuracy here.
        log_probs = F.log_softmax(logits[0, start - 1:-1].float(), dim=-1)
        token_logprobs = log_probs.gather(1, targets.unsqueeze(-1)).squeeze(-1)
        return token_logprobs.mean()

    def compute_grpo_loss(self, queries, responses, rewards):
        """Pairwise ranking loss over the responses of each query.

        For every pair of responses to the same query with different rewards the
        loss is ``-logsigmoid(logp_preferred - logp_dispreferred)``, where ``logp``
        is the mean per-token log-probability. The result is the mean over all
        such pairs in the batch.

        Returns:
            0-dim tensor. When no pair is comparable (all rewards tied, or fewer
            than two responses per query) a gradient-free zero tensor is returned,
            so callers can always inspect ``loss.requires_grad``.
        """
        self.model.train()
        pair_losses = []

        for query, query_responses, query_rewards in zip(queries, responses, rewards):
            if len(query_responses) < 2:
                continue

            # Tokenise the prompt once, with the same truncation used for generation.
            prompt_ids = self.tokenizer(
                query, return_tensors="pt", truncation=True, max_length=self.MAX_PROMPT_TOKENS
            )["input_ids"].to(self.device)

            response_logprobs = [
                self._response_logprob(prompt_ids, response) for response in query_responses
            ]

            for i in range(len(response_logprobs)):
                for j in range(i + 1, len(response_logprobs)):
                    if query_rewards[i] == query_rewards[j]:
                        continue  # tied rewards carry no preference signal
                    # Preference: higher reward should have higher log prob
                    if query_rewards[i] > query_rewards[j]:
                        preferred, dispreferred = response_logprobs[i], response_logprobs[j]
                    else:
                        preferred, dispreferred = response_logprobs[j], response_logprobs[i]
                    pair_losses.append(-F.logsigmoid(preferred - dispreferred))

        if not pair_losses:
            return torch.zeros((), device=self.device)
        return torch.stack(pair_losses).mean()

    def train_step(self, batch_queries, batch_ground_truths):
        """Run generation, reward scoring and loss computation for one batch.

        Returns:
            Tuple ``(loss, responses, rewards)``; the caller performs the
            backward pass and optimizer step.
        """
        # Generate responses
        responses = self.generate_responses(
            batch_queries,
            num_responses=self.config["group_size"],
            temperature=self.config["temperature"]
        )

        # Compute rewards
        rewards = self.compute_rewards(batch_queries, responses, batch_ground_truths)

        # Compute and return GRPO loss
        loss = self.compute_grpo_loss(batch_queries, responses, rewards)

        return loss, responses, rewards

print("✅ GRPO Trainer class defined")

print("✅ GRPO Trainer class defined")

## GRPO Training Loop

In [None]:
# Initialize GRPO trainer
grpo_trainer = GRPOTrainer(model, tokenizer, GRPO_CONFIG)

# Setup optimizer for GRPO: only parameters that require gradients are optimised
# and clipped; parameters with requires_grad=False never receive gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=GRPO_CONFIG["learning_rate"])

print("🚀 Starting GRPO training...")
print(f"Training on {len(grpo_batches)} batches for {GRPO_CONFIG['num_train_epochs']} epochs")

# NOTE: this loop takes one optimizer step per batch at a constant learning rate;
# GRPO_CONFIG["gradient_accumulation_steps"] and GRPO_CONFIG["warmup_ratio"] are
# not applied here.

# Training loop
training_logs = []
global_step = 0       # number of optimizer steps taken so far
skipped_batches = 0   # batches that produced no trainable loss
failed_batches = 0    # batches that raised an exception

for epoch in range(GRPO_CONFIG["num_train_epochs"]):
    epoch_loss = 0.0
    epoch_batches = 0

    progress_bar = tqdm(grpo_batches, desc=f"GRPO Epoch {epoch+1}/{GRPO_CONFIG['num_train_epochs']}")

    for batch_queries, batch_ground_truths in progress_bar:
        try:
            # Perform training step
            loss, responses, rewards = grpo_trainer.train_step(batch_queries, batch_ground_truths)

            # A batch may yield no trainable loss (e.g. every response in each group
            # got the same reward). The value may then be a plain number rather than
            # a tensor, so check the type before touching .requires_grad.
            if not (torch.is_tensor(loss) and loss.requires_grad):
                skipped_batches += 1
                continue

            # Backward pass
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)

            optimizer.step()

            loss_value = loss.item()
            epoch_loss += loss_value
            epoch_batches += 1
            global_step += 1

            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{loss_value:.4f}',
                'avg_loss': f'{epoch_loss/epoch_batches:.4f}'
            })

            # Logging
            if global_step % GRPO_CONFIG["logging_steps"] == 0:
                log_entry = {
                    "epoch": epoch,
                    "step": global_step,
                    "loss": loss_value,
                    "avg_epoch_loss": epoch_loss / epoch_batches,
                    "timestamp": datetime.now().isoformat()
                }
                training_logs.append(log_entry)

                print(f"\nStep {global_step}: Loss = {loss_value:.4f}")
                print(f"Sample responses for batch:")
                for i, (query_responses, query_rewards) in enumerate(zip(responses[:1], rewards[:1])):
                    print(f"  Query {i+1} responses:")
                    for j, (resp, rew) in enumerate(zip(query_responses[:2], query_rewards[:2])):
                        print(f"    Response {j+1} (reward: {rew:.3f}): {resp[:100]}...")

            # Save checkpoint
            if global_step % GRPO_CONFIG["save_steps"] == 0:
                checkpoint_dir = f"{GRPO_CONFIG['output_dir']}/checkpoint-{global_step}"
                os.makedirs(checkpoint_dir, exist_ok=True)
                model.save_pretrained(f"{checkpoint_dir}/lora_adapter")
                tokenizer.save_pretrained(checkpoint_dir)
                print(f"\n💾 Checkpoint saved at step {global_step}")

        except Exception as e:
            # Best-effort: one bad batch must not abort the run. Count it and drop
            # any partially accumulated gradients so they cannot leak into the
            # next step.
            failed_batches += 1
            optimizer.zero_grad(set_to_none=True)
            print(f"\n⚠️ Error in batch: {e}")
            continue

    avg_epoch_loss = epoch_loss / max(epoch_batches, 1)
    print(f"\n✅ Epoch {epoch+1} completed. Average loss: {avg_epoch_loss:.4f}")
    print(f"Batches so far - trained: {global_step}, skipped (no trainable loss): {skipped_batches}, failed: {failed_batches}")

if global_step == 0:
    # Make a run that never updated the model impossible to mistake for success.
    print("\n⚠️ No optimizer step was taken - check the batch errors and skip counts above.")

print("\n🎉 GRPO training completed!")

## Save GRPO Model

In [None]:
# Save final GRPO model (adapter weights in a sub-folder, tokenizer files next to it)
final_model_dir = f"{GRPO_CONFIG['output_dir']}/final_model"
os.makedirs(final_model_dir, exist_ok=True)

model.save_pretrained(f"{final_model_dir}/lora_adapter")
tokenizer.save_pretrained(final_model_dir)

print(f"✅ Final GRPO model saved to {final_model_dir}")

# Save GRPO training logs
with open(f"{GRPO_CONFIG['output_dir']}/grpo_training_logs.json", "w") as f:
    json.dump(training_logs, f, indent=2)

print(f"GRPO training logs saved to {GRPO_CONFIG['output_dir']}/grpo_training_logs.json")

# Create training summary
training_summary = {
    "dataset": DATASET_NAME,
    "model_config": MODEL_CONFIG,
    "grpo_config": GRPO_CONFIG,
    "total_samples": len(train_data),
    "total_grpo_steps": global_step,
    "training_completed": datetime.now().isoformat(),
    "final_model_path": final_model_dir
}

with open(f"{GRPO_CONFIG['output_dir']}/training_summary.json", "w") as f:
    # MODEL_CONFIG["dtype"] is a torch.dtype, which the json encoder rejects with a
    # TypeError; default=str stores any such non-JSON value as its string form
    # (e.g. "torch.float16").
    json.dump(training_summary, f, indent=2, default=str)

print(f"Training summary saved to {GRPO_CONFIG['output_dir']}/training_summary.json")

## Model Evaluation

In [None]:
# Test the final model: announce the start of the single-sample generation check below
print("🧪 Testing the final GRPO model...")

def test_model_generation(model, tokenizer, test_query, max_length=512):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(test_query, return_tensors="pt", truncation=True, max_length=2048)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        outputs = model.generate(
            **inputs,
            max_length=inputs["input_ids"].shape[1] + max_length,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        
        response = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:], 
            skip_special_tokens=True
        )
        return response.strip()

# Use the first formatted training example as the probe prompt.
test_sample = grpo_formatted[0]
test_query, ground_truth = test_sample["query"], test_sample["ground_truth"]

# Show the first 500 characters of the prompt.
print("=== Test Query ===")
print(f"{test_query[:500]}...")

print("\n=== Ground Truth ===")
print(ground_truth)

# Sample one completion from the trained model for comparison with the reference.
print("\n=== Model Response ===")
response = test_model_generation(model, tokenizer, test_query)
print(response)

print("\n✅ Model evaluation completed!")

## Training Summary

In [None]:
# Final recap of the run: dataset, key hyper-parameters, output location and
# feature list, emitted one line per print call.
summary_lines = (
    "📊 GRPO Training Summary",
    "=" * 50,
    f"Dataset: {DATASET_NAME}",
    f"Total samples: {len(train_data):,}",
    "Training method: Group Relative Policy Optimization (GRPO)",
    "\n🎯 Training Configuration:",
    f"GRPO epochs: {GRPO_CONFIG['num_train_epochs']}",
    f"Total GRPO steps: {global_step}",
    f"Batch size: {GRPO_CONFIG['per_device_train_batch_size']}",
    f"Learning rate: {GRPO_CONFIG['learning_rate']}",
    f"Group size: {GRPO_CONFIG['group_size']}",
    "\n💾 Model Output:",
    f"Final GRPO model: {final_model_dir}",
    "\n🔬 Key Features:",
    "✅ Group Relative Policy Optimization (GRPO) for preference learning",
    "✅ Custom reward system for Bitcoin prediction quality",
    "✅ Multi-response generation and ranking",
    "✅ Pairwise preference optimization",
    "✅ Special token handling for structured outputs",
    "\n🎉 GRPO training pipeline completed successfully!",
)
for summary_line in summary_lines:
    print(summary_line)