<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [3]</a>'.</span>

# Bitcoin Price Predictor: GRPO Training with Price Difference Rewards

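This notebook implements Group Relative Policy Optimization (GRPO) training for a Bitcoin price prediction model that has already been fine-tuned with SFT. GRPO is a PPO-style reinforcement learning technique: instead of learning a separate value function, it samples several completions per prompt and scores each one relative to the others in its group. Here the reward favours completions whose predicted prices are close to the actual prices, so training pushes the model toward smaller price prediction errors.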

## 1. Install Required Libraries for GRPO Training

In [1]:
# Install required libraries for GRPO training
!pip install trl==0.7.6 transformers>=4.38.0 datasets accelerate bitsandbytes
!pip install wandb torch>=2.0.0 peft>=0.8.0 scipy evaluate
!pip install deepspeed

# Additional libraries for data processing and visualization
!pip install pandas matplotlib seaborn numpy

















In [2]:
# # Verify installation and check for compatibility issues
# import sys
# print(f"Python version: {sys.version}")

# try:
#     import torch
#     print(f"PyTorch version: {torch.__version__}")
#     print(f"CUDA available: {torch.cuda.is_available()}")
#     if torch.cuda.is_available():
#         print(f"CUDA version: {torch.version.cuda}")
#         print(f"GPU count: {torch.cuda.device_count()}")
# except ImportError as e:
#     print(f"PyTorch import error: {e}")

# try:
#     import torchvision
#     print(f"Torchvision version: {torchvision.__version__}")
# except ImportError as e:
#     print(f"Torchvision import error: {e}")

# try:
#     import transformers
#     print(f"Transformers version: {transformers.__version__}")
# except ImportError as e:
#     print(f"Transformers import error: {e}")

# try:
#     import peft
#     print(f"PEFT version: {peft.__version__}")
# except ImportError as e:
#     print(f"PEFT import error: {e}")

# print("\n✅ If all versions are displayed above without errors, you can proceed to the next cell.")
# print("❌ If you see import errors, please restart your kernel and re-run the installation cell.")

## 2. Load SFT-Trained Model and Tokenizer

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datasets import load_dataset
import json
import re
from scipy.stats import wilcoxon

# Imports that are not already present at the top of this cell. The other
# imports that used to be repeated here were verbatim duplicates of those lines.
from collections import defaultdict
import os
import warnings

# --- Model locations ---------------------------------------------------------
# Single definition of the base checkpoint. This cell used to assign
# './Qwen3-8B' and then overwrite it with './Qwen3_8B'; only the second value
# was ever used, so that is the one kept.
# NOTE(review): confirm which directory name actually exists on disk.
base_model_id = './Qwen3_8B'
adapter_path = './my-awesome-model_final_bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news-v2/checkpoint-400'  # Adjust based on your checkpoint

# Suppress warnings that might be caused by version mismatches
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# NOTE(review): this is set after `import torch` has already run in this cell;
# confirm it has any effect at this point.
os.environ['TORCH_USE_CUDA_DSA'] = '1'

print("🔧 Environment configured with compatibility settings...")

print("📦 Loading base model and tokenizer...")
try:
    # Base weights in fp16, sharded across the available devices.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True  # Help with memory issues
    )

    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id,
        trust_remote_code=True
    )

    # Generation needs a pad token; fall back to EOS when the tokenizer has none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("✅ Individual News Model loaded successfully!")

    # Attach the SFT adapter so that GRPO starts from the SFT policy. Without
    # this step the notebook would optimise the plain base model and the later
    # "SFT vs GRPO" comparison would not measure what its title says.
    # `is_trainable=True` keeps the adapter weights unfrozen for further training.
    model = PeftModel.from_pretrained(model, adapter_path, is_trainable=True)
    print(f"✅ SFT adapter loaded from {adapter_path}")

except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("💡 Try the following solutions:")
    print("1. Restart your kernel and re-run the installation cell")
    print(f"2. Check if the model path '{base_model_id}' exists")
    print("3. Use the absolute path to your model")
    raise

  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
operator torchvision::nms does not exist

## 3. Setup Reward Function for Price Differences

In [None]:
def extract_prices_from_text(text):
    """Return the first comma-separated run of numbers found in ``text``.

    Args:
        text: Model output or reference string. Anything that is not a
            non-empty ``str`` (e.g. ``None``) yields an empty list.

    Returns:
        list[float]: The numbers of the first match, in order, or ``[]`` when
        no number is present.

    Notes:
        Only the *first* match is used. A number that appears before the price
        list (e.g. "Day 1: 45000, 46000") is therefore returned on its own, and
        thousands separators are read as list separators ("45,000" -> [45.0, 0.0]).
    """
    if not isinstance(text, str) or not text:
        return []

    # One number (optional decimal part), optionally followed by more numbers
    # separated by commas.
    price_pattern = r'(\d+(?:\.\d+)?(?:,\s*\d+(?:\.\d+)?)*)'
    first_run = re.search(price_pattern, text)
    if first_run is None:
        return []

    try:
        return [float(piece.strip()) for piece in first_run.group(1).split(',')]
    except ValueError:
        # Kept as a narrow safety net in place of the former bare `except:`.
        return []

def calculate_price_difference_reward(predicted_prices, actual_prices, max_len=10):
    """Score a predicted price sequence against the actual one.

    The reward is the sum of two terms:
      * an accuracy term, ``10 * exp(-mean_abs_error / 1000)``, which is 10 for
        a perfect match and decays as the mean absolute error grows;
      * a direction term, ``5 * fraction`` of consecutive steps whose sign of
        change (up / down / flat) matches the actual series.

    Args:
        predicted_prices: Sequence of predicted prices.
        actual_prices: Sequence of reference prices.
        max_len: Upper bound on how many leading prices are compared.

    Returns:
        float: ``-10.0`` if either sequence is empty, ``-5.0`` if fewer than
        two aligned prices are available, otherwise the combined reward.
    """
    if not (predicted_prices and actual_prices):
        return -10.0  # nothing usable to compare

    # Compare only the overlapping prefix, capped at max_len.
    horizon = min(len(predicted_prices), len(actual_prices), max_len)
    if horizon < 2:
        return -5.0  # a direction needs at least two points

    forecast = np.array(predicted_prices[:horizon])
    truth = np.array(actual_prices[:horizon])

    # Accuracy term: error is scaled by 1000 before the exponential decay.
    mean_abs_diff = np.mean(np.abs(forecast - truth))
    accuracy_term = 10.0 * np.exp(-mean_abs_diff / 1000)

    # Direction term: share of steps whose sign of change agrees.
    same_direction = np.sign(np.diff(truth)) == np.sign(np.diff(forecast))
    hit_rate = np.mean(same_direction) if len(same_direction) > 0 else 0

    return float(accuracy_term + 5.0 * hit_rate)

def compute_rewards(predictions, references):
    """Return one reward per (prediction, reference) text pair.

    Prices are parsed out of both strings with ``extract_prices_from_text`` and
    scored with ``calculate_price_difference_reward``. Pairs are formed with
    ``zip``, so the result is as long as the shorter of the two inputs.
    """
    return [
        calculate_price_difference_reward(
            extract_prices_from_text(generated),
            extract_prices_from_text(target),
        )
        for generated, target in zip(predictions, references)
    ]

## 4. Create GRPO Dataset from Bitcoin Predictions

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
import random

print("Loading Bitcoin prediction dataset...")

DATASET_ID = 'tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news'
HELD_OUT_FRACTION = 0.1

# Only the 'train' split of the Hub dataset is read. Previously that split was
# loaded twice and used as both train_dataset and test_dataset, so every
# evaluation below ran on rows the model was also trained on.
full_dataset = load_dataset(DATASET_ID, split='train')

# Carve a held-out tail off the split instead. shuffle=False keeps the stored
# row order, so the test rows are the last HELD_OUT_FRACTION of the data.
# NOTE(review): presumably the rows are in chronological order, which would make
# this a forward-in-time hold-out — confirm against the dataset card.
# NOTE(review): if the SFT adapter was trained on the full split, these rows
# are still not unseen for the SFT stage.
holdout = full_dataset.train_test_split(test_size=HELD_OUT_FRACTION, shuffle=False)
train_dataset = holdout['train']
test_dataset = holdout['test']

print(f"Loaded {len(train_dataset)} training samples and {len(test_dataset)} test samples")

def format_prompt(example):
    """Render one dataset row as a chat-style prompt string.

    The row's ``instruction`` becomes the system turn and its ``input`` the
    user turn (both default to an empty string when missing). The prompt ends
    with an opened assistant turn so generation continues from there.
    """
    system_text = example.get('instruction', '')
    user_text = example.get('input', '')

    return (
        f"<|im_start|>system\n{system_text}<|im_end|>\n"
        f"<|im_start|>user\n{user_text}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

# GRPO rollouts are expensive, so train on at most the first 500 rows.
# Raise or lower the cap to match the available compute.
num_samples = min(500, len(train_dataset))
grpo_train_dataset = train_dataset.select(range(num_samples))

# One record per row: the formatted prompt, the ground-truth completion, and a
# placeholder for completions the model produces during training.
grpo_data = [
    {
        "prompt": format_prompt(row),
        "chosen": row.get('output', ''),
        "rejected": None,
    }
    for row in grpo_train_dataset
]

# Round-trip through pandas to obtain a `datasets.Dataset`.
grpo_dataset = Dataset.from_pandas(pd.DataFrame(grpo_data))

print(f"Created GRPO dataset with {len(grpo_dataset)} samples")
print("Sample prompt:")
print(grpo_dataset[0]['prompt'])

## 5. Configure GRPO Training Arguments

In [None]:
from transformers import TrainingArguments

# Root directory for checkpoints, metrics and the final model.
output_dir = "./grpo_bitcoin_price_predictor"

# Generic Hugging Face training arguments.
training_args = TrainingArguments(
    output_dir=output_dir,
    # -- schedule ---------------------------------------------------------
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,      # micro-batches accumulated per optimizer step
    # -- optimisation -----------------------------------------------------
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=True,
    # -- logging / evaluation / checkpointing -----------------------------
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,                 # keep only the three most recent checkpoints
    load_best_model_at_end=True,
    report_to="none",                   # change to "wandb" to enable reporting
)

# GRPO-specific hyperparameters, kept as a plain dict and read by later cells.
grpo_config = dict(
    # rollout / batching
    num_rollouts=32,             # rollouts per prompt
    chunk_size=4,                # the training loop samples this many prompts per batch
    rollout_batch_size=8,        # batch size for rollout generation
    # policy-optimisation coefficients
    beta=0.1,                    # KL penalty coefficient
    lambda_coef=0.95,            # GAE lambda
    gamma=0.99,                  # discount factor
    eps_clip=0.2,                # policy clip range
    value_clip=0.2,              # value clip range
    # generation
    generate_during_eval=True,
    max_new_tokens=256,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True,
)

print("Training arguments and GRPO configuration ready")

## 6. Initialize GRPO Trainer

In [None]:
# Builds the trainer configuration, the sampling settings used for rollouts, and
# the trainer object (`grpo_trainer`) that the training loop and save cells use.
#
# NOTE(review): `GroupPPOConfig` / `GroupPPOTrainer` are not names I can find in
# TRL's documented API (TRL documents `PPOConfig`/`PPOTrainer` and, in later
# releases, `GRPOConfig`/`GRPOTrainer`). Confirm that this import resolves
# against the trl version pinned in the install cell before relying on this cell.
from trl import GroupPPOConfig, GroupPPOTrainer
from trl.core import LengthSampler

# 1. Define a single, comprehensive configuration object for GroupPPO
# This combines parameters that were previously in TrainingArguments and the separate RL config.
# The values below repeat numbers that also appear in `training_args` / `grpo_config`;
# neither of those objects is passed to the config here, so edits there do not propagate.
ppo_config = GroupPPOConfig(
    # --- Training loop parameters ---
    learning_rate=1e-5,
    batch_size=16,          # Combined from per_device_train_batch_size * gradient_accumulation_steps
    mini_batch_size=4,      # Corresponds to per_device_train_batch_size
    gradient_accumulation_steps=4,
    ppo_epochs=4,           # Number of optimization epochs per PPO phase
    
    # --- RL-specific parameters ---
    beta=0.1,               # KL penalty coefficient
    lambda_=0.95,           # GAE lambda coefficient (note the underscore)
    gamma=0.99,             # Discount factor
    cliprange=0.2,          # PPO clip range
    cliprange_value=0.2,    # Value function clip range
    vf_coef=0.1,            # Value function coefficient in the loss
    
    # --- Other trainer settings ---
    log_with=None,          # Set to "wandb" or "tensorboard" to enable logging
    tracker_project_name="bitcoin_grpo",
    seed=42,
    optimize_cuda_cache=True,
    target_kl=0.1
)

# 2. Set generation kwargs for rollouts
# These are passed to the `trainer.generate` method inside the training loop
# They mirror the sampling values stored in `grpo_config` but are written out again here.
generation_kwargs = {
    "min_length": -1,
    "max_new_tokens": 256,
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.95,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id, # Assumes tokenizer is already loaded
}

# 3. Initialize the GRPO trainer
# Assumes model, tokenizer, and grpo_dataset are already loaded
print("Initializing GRPO trainer...")
grpo_trainer = GroupPPOTrainer(
    config=ppo_config,
    model=model,
    # NOTE(review): None is passed although a reference model is recommended for the
    # KL term; whether the trainer then creates its own frozen copy is not visible
    # in this notebook — confirm.
    ref_model=None,  # It's good practice to have a reference model for KL divergence
    tokenizer=tokenizer,
    dataset=grpo_dataset,
    data_collator=None, # The trainer will use its default collator
)

print("GRPO trainer initialized successfully!")
print("\nNext, you will start the training loop (e.g., for batch in grpo_trainer.dataloader: ...)")



## 7. Run GRPO Training Loop

In [None]:
# Manual training loop: for each epoch, sample prompts, generate one completion per
# prompt, score the completions with `compute_rewards`, hand everything to
# `grpo_trainer.step`, and record the mean reward / loss / KL for later plotting.

# Define a length sampler for response generation
# NOTE(review): the sampled length is stored in `response_length` below but never
# passed to `generate`; the generated length is bounded by
# `generation_kwargs["max_new_tokens"]` instead.
length_sampler = LengthSampler(min_value=32, max_value=grpo_config['max_new_tokens'])

# Performance tracking variables
epochs = 3  # set independently of `training_args.num_train_epochs`
reward_history = []   # one mean reward per processed batch
loss_history = []     # one entry per batch for which `step` returned stats
kl_div_history = []   # same cadence as loss_history

print("Starting GRPO training...")
for epoch in range(epochs):
    print(f"\n===== Epoch {epoch+1}/{epochs} =====")
    
    # Training loop for multiple batch iterations
    # Each epoch therefore sees 10 * grpo_config['chunk_size'] sampled prompts, not the whole dataset.
    for batch_idx in range(10):  # Process 10 batches per epoch - adjust as needed
        print(f"Processing batch {batch_idx+1}/10")
        
        # Sample batch of prompts
        # Sampling is without replacement inside a batch; the `random` module is not
        # seeded anywhere in this notebook, so batches differ between runs.
        batch_indices = random.sample(range(len(grpo_dataset)), grpo_config['chunk_size'])
        batch = grpo_dataset.select(batch_indices)
        
        # Generate model responses
        prompts = batch['prompt']
        references = batch['chosen']  # ground-truth outputs used as the reward target
        
        # Step 1: Generate responses for reward computation
        # Exactly one completion is generated per prompt; `grpo_config['num_rollouts']`
        # is not read in this loop.
        response_tensors = []
        responses = []
        
        for prompt in prompts:
            prompt_tensor = tokenizer(prompt, return_tensors="pt").to(model.device)
            response_length = length_sampler()  # unused, see note at the top of the cell
            
            with torch.no_grad():
                response_tensor = grpo_trainer.generate(
                    prompt_tensor['input_ids'], 
                    **generation_kwargs
                )
                
            # Drop the first prompt-length tokens before decoding.
            # assumes `generate` returns prompt + completion in one sequence — TODO confirm
            response = tokenizer.decode(
                response_tensor[0][prompt_tensor['input_ids'].shape[1]:],
                skip_special_tokens=True
            )
            responses.append(response)
            response_tensors.append(response_tensor)  # collected but not read again in this cell
        
        # Step 2: Compute rewards based on price differences
        rewards = compute_rewards(responses, references)
        mean_reward = sum(rewards) / len(rewards) if rewards else 0
        reward_history.append(mean_reward)
        
        print(f"Mean reward: {mean_reward:.4f}")
        
        # Step 3: Perform GRPO update
        # NOTE(review): `step` is given the prompt strings, decoded response strings and
        # Python-float rewards. TRL's `PPOTrainer.step` expects lists of tensors for all
        # three; confirm what this trainer expects (the token tensors are available in
        # `response_tensors`).
        stats = grpo_trainer.step(prompts, responses, rewards)
        
        # Track metrics
        # Nothing is recorded for a batch when `stats` is falsy, so loss/KL histories can
        # be shorter than reward_history.
        if stats:
            loss_history.append(stats['ppo/loss/total'])
            kl_div_history.append(stats.get('ppo/kl', 0))
            
            print(f"Loss: {stats['ppo/loss/total']:.4f}, KL div: {stats.get('ppo/kl', 0):.4f}")
    
    # Save checkpoint at the end of each epoch
    checkpoint_path = f"{output_dir}/checkpoint-epoch-{epoch+1}"
    grpo_trainer.save_pretrained(checkpoint_path)
    print(f"Saved checkpoint to {checkpoint_path}")

print("GRPO training completed!")

## 8. Evaluate GRPO-Trained Model

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wilcoxon

# Plot training metrics: one panel per tracked series, side by side.
metric_panels = [
    (reward_history, 'Mean Reward History', 'Reward'),
    (loss_history, 'Loss History', 'Loss'),
    (kl_div_history, 'KL Divergence History', 'KL Divergence'),
]

plt.figure(figsize=(15, 5))
for position, (series, panel_title, y_label) in enumerate(metric_panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(series)
    plt.title(panel_title)
    plt.xlabel('Training Step')
    plt.ylabel(y_label)
    plt.grid(True)

plt.tight_layout()
plt.savefig('grpo_training_metrics.png', dpi=300)
plt.show()

print("\nEvaluating GRPO-trained model...")

# Take the trained model from the trainer and switch it to inference mode
# for the evaluation cells that follow.
grpo_model = grpo_trainer.model
grpo_model.eval()

def evaluate_model(model, tokenizer, dataset, num_samples=50):
    """Greedy-decode the first ``num_samples`` rows of ``dataset`` and score them.

    For every row the prompt is built with ``format_prompt``, a completion is
    generated without sampling, and prices are parsed from both the completion
    and the row's ``output`` field. Rows where either side yields no prices are
    skipped, so the returned list may be shorter than the number of rows
    processed.

    Returns:
        list[dict]: One entry per scorable row with the keys ``sample_id``,
        ``predicted``, ``actual`` (both truncated to their common length),
        ``mean_abs_diff``, ``direction_accuracy`` (percent) and ``reward``.
    """
    sample_count = min(num_samples, len(dataset))
    print(f"Running evaluation on {sample_count} samples...")

    records = []
    for idx in range(sample_count):
        row = dataset[idx]

        encoded = tokenizer(format_prompt(row), return_tensors='pt', truncation=True, max_length=2048)
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
        prompt_len = encoded['input_ids'].shape[1]

        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_new_tokens=256,
                do_sample=False,  # greedy decoding keeps runs comparable
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the tokens that follow the prompt.
        completion = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)

        predicted = extract_prices_from_text(completion)
        actual = extract_prices_from_text(row.get('output', ''))

        if predicted and actual:
            # Score the overlapping prefix of the two price lists.
            shared = min(len(predicted), len(actual))
            predicted, actual = predicted[:shared], actual[:shared]

            errors = np.abs(np.array(predicted) - np.array(actual))
            matches = np.sign(np.diff(actual)) == np.sign(np.diff(predicted))

            records.append({
                'sample_id': idx,
                'predicted': predicted,
                'actual': actual,
                'mean_abs_diff': np.mean(errors),
                'direction_accuracy': np.mean(matches) * 100 if len(matches) > 0 else 0,
                'reward': calculate_price_difference_reward(predicted, actual)
            })

        # Progress is reported every 10 rows, whether or not the row was scorable.
        if (idx + 1) % 10 == 0:
            print(f"Processed {idx + 1}/{sample_count} samples...")

    return records

# Score the GRPO-trained model on the test set.
grpo_results = evaluate_model(grpo_model, tokenizer, test_dataset)

# Aggregate the per-sample metrics; these lists are reused by the comparison cell.
if not grpo_results:
    print("No valid evaluation results for GRPO model")
else:
    grpo_abs_diffs = [entry['mean_abs_diff'] for entry in grpo_results]
    grpo_direction_accs = [entry['direction_accuracy'] for entry in grpo_results]
    grpo_rewards = [entry['reward'] for entry in grpo_results]

    mean_abs, median_abs = np.mean(grpo_abs_diffs), np.median(grpo_abs_diffs)

    print("\nGRPO Model Evaluation Results:")
    print(f"Mean Absolute Price Difference: ${mean_abs:.2f}")
    print(f"Median Absolute Price Difference: ${median_abs:.2f}")
    print(f"Mean Direction Accuracy: {np.mean(grpo_direction_accs):.2f}%")
    print(f"Mean Reward: {np.mean(grpo_rewards):.4f}")

## 9. Compare SFT vs GRPO Performance

In [None]:
# Reload the original SFT model (base weights + SFT adapter) as the baseline
print("Loading SFT model for comparison...")
sft_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

sft_model = PeftModel.from_pretrained(sft_model, adapter_path)
sft_model.eval()

# Evaluate SFT model (pre-GRPO)
sft_results = evaluate_model(sft_model, tokenizer, test_dataset)


def _relative_improvement(baseline, candidate, higher_is_better=True):
    """Percent change from ``baseline`` to ``candidate``; positive means better.

    The change is divided by ``abs(baseline)`` so a negative baseline (rewards
    can be negative) does not flip the sign of the result. Returns ``0.0`` when
    the baseline is zero instead of dividing by zero.
    """
    if baseline == 0:
        return 0.0
    gain = (candidate - baseline) if higher_is_better else (baseline - candidate)
    return float(gain / abs(baseline) * 100)


def _paired_metric(metric, baseline_by_id, candidate_by_id, sample_ids):
    """Return the two aligned value lists of ``metric`` for ``sample_ids``."""
    return (
        [baseline_by_id[sid][metric] for sid in sample_ids],
        [candidate_by_id[sid][metric] for sid in sample_ids],
    )


# Compare SFT and GRPO models
if sft_results and grpo_results:
    sft_abs_diffs = [r['mean_abs_diff'] for r in sft_results]
    sft_direction_accs = [r['direction_accuracy'] for r in sft_results]
    sft_rewards = [r['reward'] for r in sft_results]

    # Calculate improvements (lower is better for price differences)
    abs_diff_improvement = _relative_improvement(
        np.mean(sft_abs_diffs), np.mean(grpo_abs_diffs), higher_is_better=False)
    median_abs_diff_improvement = _relative_improvement(
        np.median(sft_abs_diffs), np.median(grpo_abs_diffs), higher_is_better=False)
    direction_improvement = _relative_improvement(
        np.mean(sft_direction_accs), np.mean(grpo_direction_accs))
    reward_improvement = _relative_improvement(
        np.mean(sft_rewards), np.mean(grpo_rewards))

    # Create comparison table
    comparison_metrics = {
        'Metric': [
            'Mean Absolute Price Difference ($)',
            'Median Absolute Price Difference ($)',
            'Mean Direction Accuracy (%)',
            'Mean Reward'
        ],
        'SFT Model': [
            f"{np.mean(sft_abs_diffs):.2f}",
            f"{np.median(sft_abs_diffs):.2f}",
            f"{np.mean(sft_direction_accs):.2f}",
            f"{np.mean(sft_rewards):.4f}"
        ],
        'GRPO Model': [
            f"{np.mean(grpo_abs_diffs):.2f}",
            f"{np.median(grpo_abs_diffs):.2f}",
            f"{np.mean(grpo_direction_accs):.2f}",
            f"{np.mean(grpo_rewards):.4f}"
        ],
        'Improvement (%)': [
            f"{abs_diff_improvement:.2f}",
            f"{median_abs_diff_improvement:.2f}",
            f"{direction_improvement:.2f}",
            f"{reward_improvement:.2f}"
        ]
    }

    comparison_df = pd.DataFrame(comparison_metrics)
    print("\n=== SFT vs GRPO Model Comparison ===")
    print(comparison_df.to_string(index=False))

    # Statistical significance tests
    # The Wilcoxon signed-rank test compares paired observations. evaluate_model
    # skips samples it cannot parse, so the two result lists may differ in
    # length and in which samples they contain; align them on sample_id first.
    print("\n=== Statistical Significance Tests ===")
    sft_by_id = {r['sample_id']: r for r in sft_results}
    grpo_by_id = {r['sample_id']: r for r in grpo_results}
    shared_ids = sorted(set(sft_by_id) & set(grpo_by_id))
    print(f"Paired samples used for testing: {len(shared_ids)}")
    try:
        _, abs_diff_pval = wilcoxon(*_paired_metric('mean_abs_diff', sft_by_id, grpo_by_id, shared_ids))
        _, direction_pval = wilcoxon(*_paired_metric('direction_accuracy', sft_by_id, grpo_by_id, shared_ids))
        _, reward_pval = wilcoxon(*_paired_metric('reward', sft_by_id, grpo_by_id, shared_ids))

        alpha = 0.05
        print(f"Absolute Difference p-value: {abs_diff_pval:.6f} - {'Significant' if abs_diff_pval < alpha else 'Not significant'}")
        print(f"Direction Accuracy p-value: {direction_pval:.6f} - {'Significant' if direction_pval < alpha else 'Not significant'}")
        print(f"Reward p-value: {reward_pval:.6f} - {'Significant' if reward_pval < alpha else 'Not significant'}")
    except Exception as e:
        # Best effort: a failed test is reported and the plots below are still drawn.
        print(f"Statistical test error: {e}")

    # Visualization
    plt.figure(figsize=(15, 10))

    # Price difference comparison. Both histograms share the same bin edges so
    # the bars are directly comparable, and each call carries its own legend
    # label (a list-valued `label` does not map onto the two series).
    plt.subplot(2, 2, 1)
    shared_bins = np.histogram_bin_edges(
        np.concatenate([sft_abs_diffs, grpo_abs_diffs]), bins=30)
    plt.hist(sft_abs_diffs, bins=shared_bins, alpha=0.6, label='SFT Model')
    plt.hist(grpo_abs_diffs, bins=shared_bins, alpha=0.6, label='GRPO Model')
    plt.title('Absolute Price Differences Distribution')
    plt.xlabel('Absolute Difference ($)')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Direction accuracy comparison
    plt.subplot(2, 2, 2)
    plt.boxplot([sft_direction_accs, grpo_direction_accs], labels=['SFT Model', 'GRPO Model'])
    plt.title('Direction Accuracy Comparison')
    plt.ylabel('Direction Accuracy (%)')
    plt.grid(True, alpha=0.3)

    # Reward comparison
    plt.subplot(2, 2, 3)
    plt.boxplot([sft_rewards, grpo_rewards], labels=['SFT Model', 'GRPO Model'])
    plt.title('Reward Comparison')
    plt.ylabel('Reward')
    plt.grid(True, alpha=0.3)

    # Improvement metrics
    plt.subplot(2, 2, 4)
    metrics = ['Abs Diff', 'Direction Acc', 'Reward']
    improvements = [abs_diff_improvement, direction_improvement, reward_improvement]

    colors = ['green' if imp > 0 else 'red' for imp in improvements]
    plt.bar(metrics, improvements, color=colors, alpha=0.7)

    # Add value labels on bars (above positive bars, below negative ones)
    for i, imp in enumerate(improvements):
        plt.text(i, imp, f'{imp:.1f}%', ha='center', va='bottom' if imp > 0 else 'top',
                fontweight='bold')

    plt.title('GRPO Improvements over SFT')
    plt.ylabel('Improvement (%)')
    plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('sft_vs_grpo_comparison.png', dpi=300)
    plt.show()

    print("\nComparison visualization saved as 'sft_vs_grpo_comparison.png'")

else:
    print("Insufficient data for SFT vs GRPO comparison")

## 10. Save GRPO Model and Results

In [None]:
# Save the final GRPO model, the training curves and the evaluation results.
import json
import os

final_output_dir = f"{output_dir}/final"
grpo_trainer.save_pretrained(final_output_dir)
print(f"Saved final GRPO model to {final_output_dir}")

# The metric files are written straight into output_dir, so make sure it exists
# regardless of what save_pretrained created.
os.makedirs(output_dir, exist_ok=True)


def _to_float_list(values):
    """Convert a sequence of numeric scalars to plain Python floats.

    The loss/KL entries come from the trainer's stats dict; presumably they can
    be numpy or tensor scalars, which ``json`` cannot encode directly.
    """
    return [float(value) for value in values]


def _serialize_results(results):
    """Return the per-sample evaluation records as JSON-encodable dicts."""
    return [{
        'sample_id': r['sample_id'],
        'predicted': _to_float_list(r['predicted']),
        'actual': _to_float_list(r['actual']),
        'mean_abs_diff': float(r['mean_abs_diff']),
        'direction_accuracy': float(r['direction_accuracy']),
        'reward': float(r['reward'])
    } for r in results]


def _summarize_results(results):
    """Return the aggregate metrics for one model's evaluation records."""
    abs_diffs = [r['mean_abs_diff'] for r in results]
    return {
        'mean_abs_diff': float(np.mean(abs_diffs)),
        'median_abs_diff': float(np.median(abs_diffs)),
        'mean_direction_accuracy': float(np.mean([r['direction_accuracy'] for r in results])),
        'mean_reward': float(np.mean([r['reward'] for r in results]))
    }


# Save training metrics
training_metrics = {
    'reward_history': _to_float_list(reward_history),
    'loss_history': _to_float_list(loss_history),
    'kl_div_history': _to_float_list(kl_div_history)
}

with open(f"{output_dir}/training_metrics.json", 'w') as f:
    json.dump(training_metrics, f, indent=2)

# Save evaluation results (only when both models produced scorable samples,
# which is also the condition under which the improvement figures exist)
if sft_results and grpo_results:
    eval_results = {
        'sft_model': {
            'model_id': base_model_id,
            'adapter_path': adapter_path,
            'results': _serialize_results(sft_results),
            'metrics': _summarize_results(sft_results)
        },
        'grpo_model': {
            'model_id': base_model_id,
            'adapter_path': final_output_dir,
            'results': _serialize_results(grpo_results),
            'metrics': _summarize_results(grpo_results)
        },
        'improvements': {
            'abs_diff_improvement': float(abs_diff_improvement),
            'direction_improvement': float(direction_improvement),
            'reward_improvement': float(reward_improvement)
        }
    }

    with open(f"{output_dir}/evaluation_results.json", 'w') as f:
        json.dump(eval_results, f, indent=2)

    print(f"Saved evaluation results to {output_dir}/evaluation_results.json")

print("\n📊 SUMMARY: GRPO TRAINING FOR BITCOIN PRICE PREDICTION")
print("="* 60)
print("✅ Completed GRPO training with price difference rewards")
print("✅ Model saved and ready for inference")
print("✅ Performance analysis and comparisons completed")
print("\nTo use this model for inference, load it with:")
print("```python")
print("from transformers import AutoModelForCausalLM, AutoTokenizer")
print("from peft import PeftModel")
print(f"model = AutoModelForCausalLM.from_pretrained('{base_model_id}', trust_remote_code=True)")
print(f"model = PeftModel.from_pretrained(model, '{final_output_dir}')")
print(f"tokenizer = AutoTokenizer.from_pretrained('{base_model_id}', trust_remote_code=True)")
print("```")

In [None]:
# Walk through a single prediction with the GRPO model, end to end.
print("Sample prediction with GRPO model:")

# First test row serves as the example.
test_sample = test_dataset[0]
prompt = format_prompt(test_sample)

# Tokenize and move every tensor to the model's device.
inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=2048)
inputs = {name: tensor.to(grpo_model.device) for name, tensor in inputs.items()}
prompt_token_count = inputs['input_ids'].shape[1]

# Greedy decoding, no gradients.
with torch.no_grad():
    outputs = grpo_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

# Keep only the tokens generated after the prompt.
generated_text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

# Parse prices from the completion and from the reference output.
predicted_prices = extract_prices_from_text(generated_text)
actual_output = test_sample.get('output', '')
actual_prices = extract_prices_from_text(actual_output)

# Long texts are cut to 500 characters for display.
prompt_preview = prompt[:500] + "..." if len(prompt) > 500 else prompt
generation_preview = generated_text[:500] + "..." if len(generated_text) > 500 else generated_text

print("Input prompt:")
print(prompt_preview)
print("\nGRPO model prediction:")
print(generation_preview)
print(f"\nPredicted prices: {predicted_prices}")
print(f"Actual prices: {actual_prices}")

# Metrics are only meaningful when both sides yielded at least one price.
if predicted_prices and actual_prices:
    min_len = min(len(predicted_prices), len(actual_prices))
    pred_truncated = predicted_prices[:min_len]
    actual_truncated = actual_prices[:min_len]

    abs_diffs = np.abs(np.array(pred_truncated) - np.array(actual_truncated))
    mean_abs_diff = np.mean(abs_diffs)
    reward = calculate_price_difference_reward(pred_truncated, actual_truncated)

    print(f"\nMean absolute difference: ${mean_abs_diff:.2f}")
    print(f"Reward: {reward:.4f}")