# GRPO Training for Bitcoin Enhanced Prediction (Final Output)

This notebook implements Group Relative Policy Optimization (GRPO) for comprehensive Bitcoin prediction with enhanced datasets.

**Dataset**: `bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news`

**Training Method**: Group Relative Policy Optimization (GRPO)
- Preference learning through relative comparisons
- Custom reward system for Bitcoin prediction quality
- Multi-response generation and ranking
- Enhanced final output model

## Install Libraries

In [None]:
# !pip install -U unsloth
# !pip install trl
# !pip install accelerate
# !pip install datasets
# !pip install transformers

## Imports

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
import torch, random, os
from typing import Dict, List, Any
import json
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import TrainingArguments
from tqdm import tqdm

SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

## Configuration

In [None]:
# Model and training configuration
MODEL_CONFIG = {
    "model_path": "./Qwen3-8B",  # Base model path
    # "model_path": "qwen_bitcoin_final_sft_enhanced/lora_adapter",  # Use this if loading from SFT checkpoint
    "max_seq_length": 4096,
    "dtype": torch.float16,
    "load_in_4bit": True,
    "lora_r": 32,
    "lora_alpha": 32,
    "lora_dropout": 0.0,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
}

GRPO_CONFIG = {
    "output_dir": "qwen_bitcoin_final_grpo_only_enhanced",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "learning_rate": 5e-7,
    "logging_steps": 10,
    "save_steps": 50,
    "warmup_ratio": 0.1,
    "max_length": 1024,
    "max_prompt_length": 512,
    "beta": 0.1,
    "remove_unused_columns": False,
    "fp16": not is_bfloat16_supported(),
    "bf16": is_bfloat16_supported(),
}

DATASET_NAME = "tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news"

## Load Model and Tokenizer

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    MODEL_CONFIG["model_path"],
    max_seq_length=MODEL_CONFIG["max_seq_length"],
    dtype=MODEL_CONFIG["dtype"],
    load_in_4bit=MODEL_CONFIG["load_in_4bit"],
)

# Apply chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports Qwen models
)

# Prepare model for training
model = FastLanguageModel.get_peft_model(
    model,
    r=MODEL_CONFIG["lora_r"],
    target_modules=MODEL_CONFIG["target_modules"],
    lora_alpha=MODEL_CONFIG["lora_alpha"],
    lora_dropout=MODEL_CONFIG["lora_dropout"],
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False,
    loftq_config=None,
)

print(f"Model loaded: {MODEL_CONFIG['model_path']}")
print(f"Total parameters: {model.num_parameters():,}")
print(f"Trainable parameters: {model.num_parameters(only_trainable=True):,}")

## Special Tokens Setup

In [None]:
# Define special tokens for enhanced prediction
SPECIAL_TOKENS = ["<|response|>", "<|analysis|>", "<|forecast|>", "<|confidence|>", "<|reasoning|>", "<|thinking|>"]
num_added = tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))
    print(f"Added {num_added} special tokens")

RESPONSE_TAG = "<|response|>"
ANALYSIS_TAG = "<|analysis|>"
FORECAST_TAG = "<|forecast|>"
CONFIDENCE_TAG = "<|confidence|>"
REASONING_TAG = "<|reasoning|>"
THINKING_TAG = "<|thinking|>"

response_token_id = tokenizer.convert_tokens_to_ids(RESPONSE_TAG)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"
tokenizer.padding_side = "right"

print(f"Special tokens: {SPECIAL_TOKENS}")
print(f"Response token ID: {response_token_id}")

## Load and Prepare Dataset

In [None]:
# Load dataset
raw_dataset = load_dataset(DATASET_NAME)
print(f"Loaded dataset: {DATASET_NAME}")
print(f"Dataset structure: {raw_dataset}")

train_data = raw_dataset["train"]
print(f"Total training samples: {len(train_data)}")

# Show sample data
print("\n=== Sample Data ===")
sample = train_data[0]
for key, value in sample.items():
    print(f"{key}: {str(value)[:100]}...")

## Data Processing for GRPO

In [None]:
def formatting_prompts_func(examples):
    """Format examples for Unsloth GRPO training"""
    instructions = examples.get("instruction", [""] * len(examples.get("input", [])))
    inputs = examples.get("input", [])
    outputs = examples.get("output", [])
    
    conversations = []
    for instruction, user_input, output in zip(instructions, inputs, outputs):
        messages = [
            {"role": "system", "content": instruction or ""},
            {"role": "user", "content": user_input or ""},
            {"role": "assistant", "content": output or ""},
        ]
        conversations.append(messages)
    
    return {"conversations": conversations}

# Format data for Unsloth GRPO
formatted_dataset = train_data.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=train_data.column_names
)

print(f"Formatted dataset for Unsloth GRPO: {len(formatted_dataset)} samples")

# Show sample formatted data
print("\n=== Sample Formatted Data ===")
sample_conversation = formatted_dataset[0]["conversations"]
for message in sample_conversation:
    print(f"{message['role']}: {message['content'][:100]}...")
print()

## Unsloth GRPO Training Setup

In [None]:
# Initialize Unsloth GRPO Trainer
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=GRPO_CONFIG["output_dir"],
    num_train_epochs=GRPO_CONFIG["num_train_epochs"],
    per_device_train_batch_size=GRPO_CONFIG["per_device_train_batch_size"],
    gradient_accumulation_steps=GRPO_CONFIG["gradient_accumulation_steps"],
    learning_rate=GRPO_CONFIG["learning_rate"],
    logging_steps=GRPO_CONFIG["logging_steps"],
    save_steps=GRPO_CONFIG["save_steps"],
    warmup_ratio=GRPO_CONFIG["warmup_ratio"],
    fp16=GRPO_CONFIG["fp16"],
    bf16=GRPO_CONFIG["bf16"],
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=SEED,
    remove_unused_columns=GRPO_CONFIG["remove_unused_columns"],
    dataloader_num_workers=2,
)

# Initialize GRPO trainer using Unsloth
grpo_trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    tokenizer=tokenizer,
    max_length=GRPO_CONFIG["max_length"],
    max_prompt_length=GRPO_CONFIG["max_prompt_length"],
    beta=GRPO_CONFIG["beta"],
    formatting_func=formatting_prompts_func,
)

print("✅ Unsloth GRPO Trainer initialized")
print(f"Model: {model.__class__.__name__}")
print(f"Training dataset: {len(formatted_dataset)} samples")
print(f"Max length: {GRPO_CONFIG['max_length']}")
print(f"Max prompt length: {GRPO_CONFIG['max_prompt_length']}")
print(f"Beta parameter: {GRPO_CONFIG['beta']}")

## Unsloth GRPO Training

In [None]:
# Initialize enhanced GRPO trainer
grpo_trainer = EnhancedGRPOTrainer(model, tokenizer, GRPO_CONFIG)

# Setup optimizer for GRPO
optimizer = AdamW(model.parameters(), lr=GRPO_CONFIG["learning_rate"])

print("🚀 Starting Enhanced GRPO training...")
print(f"Training on {len(grpo_batches)} batches for {GRPO_CONFIG['num_train_epochs']} epochs")

# Training loop
training_logs = []
global_step = 0

for epoch in range(GRPO_CONFIG["num_train_epochs"]):
    epoch_loss = 0.0
    epoch_batches = 0
    
    progress_bar = tqdm(grpo_batches, desc=f"Enhanced GRPO Epoch {epoch+1}/{GRPO_CONFIG['num_train_epochs']}")
    
    for batch_queries, batch_ground_truths in progress_bar:
        try:
            # Perform enhanced training step
            loss, responses, rewards = grpo_trainer.train_step(batch_queries, batch_ground_truths)
            
            if loss.requires_grad:
                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                
                optimizer.step()
                
                epoch_loss += loss.item()
                epoch_batches += 1
                global_step += 1
                
                # Update progress bar
                progress_bar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'avg_loss': f'{epoch_loss/epoch_batches:.4f}'
                })
                
                # Enhanced logging
                if global_step % GRPO_CONFIG["logging_steps"] == 0:
                    # Calculate reward statistics
                    all_rewards = [r for batch_rewards in rewards for r in batch_rewards]
                    avg_reward = sum(all_rewards) / len(all_rewards) if all_rewards else 0
                    max_reward = max(all_rewards) if all_rewards else 0
                    min_reward = min(all_rewards) if all_rewards else 0
                    
                    log_entry = {
                        "epoch": epoch,
                        "step": global_step,
                        "loss": loss.item(),
                        "avg_epoch_loss": epoch_loss / epoch_batches,
                        "avg_reward": avg_reward,
                        "max_reward": max_reward,
                        "min_reward": min_reward,
                        "timestamp": datetime.now().isoformat()
                    }
                    training_logs.append(log_entry)
                    
                    print(f"\nStep {global_step}: Loss = {loss.item():.4f}")
                    print(f"Reward stats - Avg: {avg_reward:.3f}, Max: {max_reward:.3f}, Min: {min_reward:.3f}")
                    print(f"Sample responses for batch:")
                    for i, (query_responses, query_rewards) in enumerate(zip(responses[:1], rewards[:1])):
                        print(f"  Query {i+1} responses:")
                        for j, (resp, rew) in enumerate(zip(query_responses[:2], query_rewards[:2])):
                            print(f"    Response {j+1} (reward: {rew:.3f}): {resp[:120]}...")
                
                # Save checkpoint
                if global_step % GRPO_CONFIG["save_steps"] == 0:
                    checkpoint_dir = f"{GRPO_CONFIG['output_dir']}/checkpoint-{global_step}"
                    os.makedirs(checkpoint_dir, exist_ok=True)
                    model.save_pretrained(f"{checkpoint_dir}/lora_adapter")
                    tokenizer.save_pretrained(checkpoint_dir)
                    print(f"\n💾 Enhanced checkpoint saved at step {global_step}")
            
        except Exception as e:
            print(f"\n⚠️ Error in batch: {e}")
            continue
    
    avg_epoch_loss = epoch_loss / max(epoch_batches, 1)
    print(f"\n✅ Enhanced Epoch {epoch+1} completed. Average loss: {avg_epoch_loss:.4f}")

print("\n🎉 Enhanced GRPO training completed!")

## Save Final GRPO Model

In [None]:
# Save final enhanced GRPO model
final_model_dir = f"{GRPO_CONFIG['output_dir']}/final_model"
os.makedirs(final_model_dir, exist_ok=True)

model.save_pretrained(f"{final_model_dir}/lora_adapter")
tokenizer.save_pretrained(final_model_dir)

print(f"✅ Final Enhanced GRPO model saved to {final_model_dir}")

# Save enhanced GRPO training logs
with open(f"{GRPO_CONFIG['output_dir']}/enhanced_grpo_training_logs.json", "w") as f:
    json.dump(training_logs, f, indent=2)

print(f"Enhanced GRPO training logs saved to {GRPO_CONFIG['output_dir']}/enhanced_grpo_training_logs.json")

# Create comprehensive training summary
training_summary = {
    "model_type": "Enhanced GRPO for Bitcoin Prediction",
    "dataset": DATASET_NAME,
    "model_config": MODEL_CONFIG,
    "grpo_config": GRPO_CONFIG,
    "total_samples": len(train_data),
    "total_grpo_steps": global_step,
    "training_completed": datetime.now().isoformat(),
    "final_model_path": final_model_dir,
    "enhancement_features": [
        "Multi-dimensional reward system",
        "Technical analysis integration",
        "News sentiment analysis",
        "Confidence scoring",
        "Structured prediction format",
        "Enhanced preference learning"
    ]
}

with open(f"{GRPO_CONFIG['output_dir']}/enhanced_training_summary.json", "w") as f:
    json.dump(training_summary, f, indent=2)

print(f"Enhanced training summary saved to {GRPO_CONFIG['output_dir']}/enhanced_training_summary.json")

## Comprehensive Model Evaluation

In [None]:
# Test the enhanced final model
print("🧪 Testing the Enhanced GRPO model...")

def test_enhanced_model_generation(model, tokenizer, test_query, max_length=600):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(test_query, return_tensors="pt", truncation=True, max_length=2048)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        outputs = model.generate(
            **inputs,
            max_length=inputs["input_ids"].shape[1] + max_length,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        
        response = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:], 
            skip_special_tokens=True
        )
        return response.strip()

# Test multiple samples
test_samples = grpo_formatted[:3]

for i, test_sample in enumerate(test_samples):
    print(f"\n=== Test Sample {i+1} ===")
    test_query = test_sample["query"]
    ground_truth = test_sample["ground_truth"]
    
    print("Query:")
    print(test_query[:300] + "...")
    
    print("\nGround Truth:")
    print(ground_truth[:200] + "...")
    
    print("\nEnhanced Model Response:")
    response = test_enhanced_model_generation(model, tokenizer, test_query)
    print(response)
    
    # Calculate reward for the response
    reward = grpo_trainer._calculate_enhanced_prediction_reward(response, ground_truth)
    print(f"\nResponse Quality Score: {reward:.3f}")
    print("-" * 80)

print("\n✅ Enhanced model evaluation completed!")

## Final Training Summary

In [None]:
print("🏆 Enhanced GRPO Training Summary")
print("=" * 60)
print(f"Dataset: {DATASET_NAME}")
print(f"Total samples: {len(train_data):,}")
print(f"Training method: Enhanced Group Relative Policy Optimization (GRPO)")
print("\n🎯 Training Configuration:")
print(f"GRPO epochs: {GRPO_CONFIG['num_train_epochs']}")
print(f"Total GRPO steps: {global_step}")
print(f"Batch size: {GRPO_CONFIG['per_device_train_batch_size']}")
print(f"Learning rate: {GRPO_CONFIG['learning_rate']}")
print(f"Group size: {GRPO_CONFIG['group_size']}")
print("\n💾 Model Output:")
print(f"Final Enhanced GRPO model: {final_model_dir}")
print("\n🔬 Advanced Features:")
print("✅ Enhanced Group Relative Policy Optimization")
print("✅ Multi-dimensional reward system for Bitcoin predictions")
print("✅ Technical analysis integration (RSI, MACD, Moving Averages)")
print("✅ News sentiment and impact analysis")
print("✅ Confidence and reasoning scoring")
print("✅ Structured prediction format with special tokens")
print("✅ Enhanced preference learning with margin scaling")
print("✅ Comprehensive quality assessment")
print("\n🚀 Model Capabilities:")
print("• Advanced Bitcoin price prediction")
print("• Technical and fundamental analysis integration")
print("• News-driven market sentiment analysis")
print("• Confidence-aware forecasting")
print("• Structured reasoning and explanation")
print("\n🎉 Enhanced GRPO training pipeline completed successfully!")
print("\n📈 Ready for production Bitcoin prediction tasks!")