# Unsloth GRPO Training for Bitcoin Enhanced Prediction

This notebook implements Group Relative Policy Optimization (GRPO) using Unsloth for comprehensive Bitcoin prediction.

**Dataset**: `bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news`

**Training Method**: Unsloth GRPO
- Reward-driven policy optimization: several completions are sampled per prompt, scored by a reward function, and compared within their group
- Efficient memory usage with Unsloth
- Streamlined training pipeline

## Install Libraries

In [None]:
# !pip install -U "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install -U xformers trl peft accelerate bitsandbytes

## Imports

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModel
from peft import PeftModel
import torch, random, os
import json
import numpy as np
from datetime import datetime

# One shared seed for every random number generator the notebook relies on
# (PyTorch, the stdlib `random` module, and NumPy's legacy global generator).
SEED = 42
for _seed_generator in (torch.manual_seed, random.seed, np.random.seed):
    _seed_generator(SEED)

## Configuration

In [None]:
# Model configuration
BASE_MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"  # Base model
# The save and summary cells further down report the model as MODEL_NAME.
# Define it here, as an alias of the base checkpoint, so those cells resolve
# on a fresh kernel instead of raising NameError.
MODEL_NAME = BASE_MODEL_NAME
ADAPTER_PATH = "tahamajs/my-awesome-model_final_bitcoin_enhanced_prediction_dataset_with_local_comprehensive_news"  # Pre-trained adapter
CHECKPOINT = "checkpoint-1152"  # Specific checkpoint
MAX_SEQ_LENGTH = 2048
DTYPE = None  # Auto-detection
LOAD_IN_4BIT = True

# LoRA configuration
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0.0
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# GRPO configuration
OUTPUT_DIR = "./qwen_bitcoin_enhanced_grpo_unsloth_pretrained"
LEARNING_RATE = 3e-7  # Lower for pre-trained model
NUM_TRAIN_EPOCHS = 1
PER_DEVICE_TRAIN_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
MAX_LENGTH = 1024         # Total token budget (prompt + completion)
MAX_PROMPT_LENGTH = 512   # Portion of MAX_LENGTH reserved for the prompt
BETA = 0.1                # KL penalty weight

# Dataset
DATASET_NAME = "tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news"

# Auxiliary model consulted by the reward function. It is a general dialogue
# language model, not a trained reward model, and it contributes only a small
# share of the total reward.
REWARD_MODEL_NAME = "microsoft/DialoGPT-medium"

## Load Model and Tokenizer

In [None]:
# Load base model
print(f"🔄 Loading base model: {BASE_MODEL_NAME}")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)

# Load pre-trained adapter
adapter_path = f"{ADAPTER_PATH}/{CHECKPOINT}"
print(f"🔄 Loading pre-trained adapter: {adapter_path}")
try:
    if os.path.isdir(adapter_path):
        # The checkpoint exists on local disk: the joined path is a real directory.
        model = PeftModel.from_pretrained(model, adapter_path, is_trainable=True)
    else:
        # ADAPTER_PATH is a Hub repository id. "<repo>/<checkpoint>" is not a
        # valid repository id, so the checkpoint directory inside the
        # repository has to be selected with `subfolder`.
        model = PeftModel.from_pretrained(
            model,
            ADAPTER_PATH,
            subfolder=CHECKPOINT,
            is_trainable=True,  # default is a frozen, inference-only adapter
        )
    print(f"✅ Successfully loaded adapter from {adapter_path}")
except Exception as e:
    # Deliberate best-effort: training can still start from the base model.
    print(f"⚠️ Could not load adapter, using base model: {e}")

# Apply chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports Qwen models
)

# Prepare model for PEFT training
if isinstance(model, PeftModel):
    # The loaded adapter is already attached and trainable, so it is trained
    # further instead of adding a second set of LoRA layers on top of it.
    # NOTE(review): confirm gradient checkpointing is still active on this
    # path, since the get_peft_model call that requests it is skipped.
    print("ℹ️ Continuing training of the loaded adapter (no new LoRA layers added)")
else:
    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_R,
        target_modules=TARGET_MODULES,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=SEED,
        use_rslora=False,
        loftq_config=None,
    )

# Load reward model for comprehensive analysis
# NOTE(review): AutoModel loads the bare backbone. The reward function only uses
# a model-derived score when the output has a `logits` attribute and otherwise
# falls back to a constant 0.5 -- confirm this is the intended model class.
print(f"\n🔄 Loading reward model: {REWARD_MODEL_NAME}")
try:
    reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_NAME)
    reward_model = AutoModel.from_pretrained(REWARD_MODEL_NAME)
    reward_model.eval()
    print("✅ Reward model loaded successfully")
except Exception as e:
    # Best-effort: the reward function skips the model-based component when
    # either object is None.
    print(f"⚠️ Could not load reward model, using rule-based rewards: {e}")
    reward_model = None
    reward_tokenizer = None

print("\n📊 Model Configuration:")
print(f"  Base model: {BASE_MODEL_NAME}")
print(f"  Pre-trained adapter: {ADAPTER_PATH}/{CHECKPOINT}")
print(f"  Max sequence length: {MAX_SEQ_LENGTH}")
print(f"  LoRA rank: {LORA_R}")
print(f"  Load in 4bit: {LOAD_IN_4BIT}")
print(f"  Reward model: {REWARD_MODEL_NAME if reward_model else 'Rule-based only'}")

## Load and Prepare Dataset

In [None]:
# Fetch the training split of the configured dataset
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Dataset loaded: {DATASET_NAME}")
print(f"Total samples: {len(dataset):,}")

# Preview the first record, clipping every field to 150 characters
print("\n=== Sample Data ===")
sample = dataset[0]
for key, value in sample.items():
    rendered = str(value)
    suffix = "..." if len(rendered) > 150 else ""
    print(f"{key}: {rendered[:150]}{suffix}")

## Format Dataset for GRPO

In [None]:
def formatting_prompts_func(examples):
    """
    Convert a batch of raw columns into chat-style conversations.

    Reads the "instruction", "input" and "output" columns of ``examples`` and
    returns ``{"conversations": [...]}`` holding one three-message list
    (system, user, assistant) per row. A missing or empty instruction is
    replaced by a generic analyst system prompt; empty inputs and outputs
    become empty strings. Rows are paired with ``zip``, so the result is as
    long as the shortest of the three columns.
    """
    default_system_prompt = "You are a helpful Bitcoin market analyst."

    user_turns = examples.get("input", [])
    system_turns = examples.get("instruction", [""] * len(user_turns))
    assistant_turns = examples.get("output", [])

    return {
        "conversations": [
            [
                {"role": "system", "content": system_text or default_system_prompt},
                {"role": "user", "content": user_text or ""},
                {"role": "assistant", "content": assistant_text or ""},
            ]
            for system_text, user_text, assistant_text in zip(
                system_turns, user_turns, assistant_turns
            )
        ]
    }

# Convert every row to the conversational layout and drop the raw columns
formatted_dataset = dataset.map(
    formatting_prompts_func,
    remove_columns=dataset.column_names,
    batched=True,
)

print(f"Formatted dataset: {len(formatted_dataset):,} samples")

# Preview the first conversation, clipping each message to 200 characters
print("\n=== Formatted Sample ===")
sample_conv = formatted_dataset[0]["conversations"]
for msg in sample_conv:
    role, content = msg["role"], msg["content"]
    tail = "..." if len(content) > 200 else ""
    print(f"**{role.upper()}**: {content[:200]}{tail}")
    print()

In [None]:
# Enhanced Reward Function for Bitcoin Comprehensive Prediction
def calculate_comprehensive_prediction_reward(response, ground_truth, reward_model=None, reward_tokenizer=None):
    """
    Score one generated Bitcoin prediction with a weighted mix of heuristics.

    Components (the weight is applied to each component's own score):
      1. length and domain-keyword coverage (0.30)
      2. technical-analysis and chart-pattern vocabulary (0.20)
      3. news and multi-factor vocabulary (0.20)
      4. concrete price, percentage and timeframe mentions (0.15)
      5. structured output parsed with ``parse_trading_output`` and compared
         with the parsed ``ground_truth``; token-level Jaccard overlap is used
         instead when the response cannot be parsed (0.25)
      6. optional score from ``reward_model`` (0.05)
      7. formatting and risk-disclaimer bonuses (added unweighted)
      8. over-confidence and hedging penalties (subtracted unweighted)

    Args:
        response: Generated text to score.
        ground_truth: Reference answer text. When falsy, the comparisons
            against the reference in component 5 are skipped.
        reward_model: Optional model called on the tokenized response.
        reward_tokenizer: Tokenizer used to encode the input for ``reward_model``.

    Returns:
        float: The total reward, clamped to the range [0.0, 1.0].
    """
    import re
    import warnings

    total_reward = 0.0
    response_lower = response.lower()

    # 1. Rule-based Prediction Quality Assessment (30% weight)
    prediction_reward = 0.0

    # Length appropriateness (comprehensive predictions need detail)
    response_len = len(response)
    if 150 <= response_len <= 1000:
        prediction_reward += 0.15
    elif 100 <= response_len <= 1200:
        prediction_reward += 0.10
    elif response_len < 100:
        prediction_reward -= 0.05  # Too brief for comprehensive analysis

    # Comprehensive analysis keywords (substring match on the lower-cased text)
    prediction_keywords = {
        'prediction': 0.08, 'forecast': 0.08, 'analysis': 0.06, 'market': 0.06,
        'bitcoin': 0.05, 'btc': 0.04, 'price': 0.07, 'trend': 0.06,
        'technical': 0.05, 'fundamental': 0.05, 'news': 0.05, 'data': 0.04,
        'indicators': 0.05, 'volume': 0.04, 'resistance': 0.04, 'support': 0.04,
        'momentum': 0.04, 'correlation': 0.03, 'pattern': 0.04, 'signal': 0.04
    }
    for keyword, weight in prediction_keywords.items():
        if keyword in response_lower:
            prediction_reward += weight

    total_reward += prediction_reward * 0.30

    # 2. Technical Analysis Integration (20% weight)
    technical_reward = 0.0

    # Technical analysis terms (capped so that listing every term is not rewarded)
    technical_terms = ['rsi', 'macd', 'moving average', 'bollinger bands', 'fibonacci',
                      'support level', 'resistance level', 'breakout', 'consolidation',
                      'overbought', 'oversold', 'bullish divergence', 'bearish divergence']
    tech_score = sum(0.03 for term in technical_terms if term in response_lower)
    technical_reward += min(0.15, tech_score)

    # Chart patterns
    pattern_terms = ['head and shoulders', 'double top', 'double bottom', 'triangle',
                    'flag', 'pennant', 'cup and handle', 'ascending', 'descending']
    pattern_score = sum(0.02 for term in pattern_terms if term in response_lower)
    technical_reward += min(0.08, pattern_score)

    total_reward += technical_reward * 0.20

    # 3. News Impact Integration (20% weight)
    news_reward = 0.0

    # News integration indicators
    news_terms = ['news impact', 'market sentiment', 'adoption news', 'regulatory',
                 'institutional', 'whale activity', 'exchange', 'etf', 'mainstream',
                 'correlation with', 'influenced by', 'driven by news']
    news_score = sum(0.03 for term in news_terms if term in response_lower)
    news_reward += min(0.12, news_score)

    # Multi-factor analysis
    factor_terms = ['multiple factors', 'combined with', 'along with', 'considering',
                   'taking into account', '综合考虑', 'holistic', 'comprehensive']
    if any(term in response_lower for term in factor_terms):
        news_reward += 0.08

    total_reward += news_reward * 0.20

    # 4. Prediction Specificity and Confidence (15% weight)
    specificity_reward = 0.0

    # Price targets and percentage moves
    price_targets = re.findall(r'\$\d+[,\d]*', response)
    percentage_targets = re.findall(r'\d+\.?\d*%', response)

    if price_targets or percentage_targets:
        specificity_reward += 0.10

    # Timeframe specificity. The patterns are searched in the lower-cased text,
    # so the quarter pattern must be lower-case too (an upper-case 'Q' could
    # never match there).
    timeframes = ['within.*day', 'next.*week', 'coming.*month', 'short.*term',
                 'long.*term', 'by.*end', r'\bq[1-4]\b', 'next.*year']
    timeframe_mentions = sum(1 for tf in timeframes if re.search(tf, response_lower))
    if timeframe_mentions > 0:
        specificity_reward += 0.05

    total_reward += specificity_reward * 0.15

    # 5. Structured Output Parsing and Accuracy (25% weight - increased)
    structured_reward = 0.0

    # Parse JSON output from response
    # NOTE(review): parse_trading_output and calculate_forecast_similarity are
    # not defined in the visible part of this notebook -- confirm they are
    # defined before this function is first called.
    response_json = parse_trading_output(response)
    ground_truth_json = parse_trading_output(ground_truth) if ground_truth else None

    if response_json:
        # Reward for valid JSON structure
        structured_reward += 0.05

        # Reward for required fields presence
        required_fields = ['action', 'confidence', 'stop_loss', 'take_profit', 'forecast_10d']
        present_fields = sum(1 for field in required_fields if field in response_json)
        structured_reward += (present_fields / len(required_fields)) * 0.10

        # Compare with ground truth if available
        if ground_truth_json:
            # Action accuracy (SELL/BUY/HOLD). Values are coerced to text first
            # so a null or numeric "action" cannot raise on .upper().
            resp_action = str(response_json.get('action') or '').upper()
            gt_action = str(ground_truth_json.get('action') or '').upper()
            if resp_action == gt_action:
                structured_reward += 0.08

            # Confidence similarity (0-100 scale)
            resp_conf = response_json.get('confidence', 0)
            gt_conf = ground_truth_json.get('confidence', 0)
            if isinstance(resp_conf, (int, float)) and isinstance(gt_conf, (int, float)):
                conf_diff = abs(resp_conf - gt_conf)
                conf_similarity = max(0, 1 - (conf_diff / 100))  # Normalize by max difference
                structured_reward += conf_similarity * 0.06

            # Price level accuracy (stop_loss, take_profit)
            for price_field in ['stop_loss', 'take_profit']:
                resp_price = response_json.get(price_field, 0)
                gt_price = ground_truth_json.get(price_field, 0)
                if isinstance(resp_price, (int, float)) and isinstance(gt_price, (int, float)) and gt_price > 0:
                    price_diff_pct = abs((resp_price - gt_price) / gt_price)
                    price_similarity = max(0, 1 - price_diff_pct)  # Closer prices get higher reward
                    structured_reward += price_similarity * 0.04

            # Forecast accuracy (10-day prediction array)
            resp_forecast = response_json.get('forecast_10d', [])
            gt_forecast = ground_truth_json.get('forecast_10d', [])
            if isinstance(resp_forecast, list) and isinstance(gt_forecast, list) and len(gt_forecast) > 0:
                forecast_similarity = calculate_forecast_similarity(resp_forecast, gt_forecast)
                structured_reward += forecast_similarity * 0.08

        # Reward for realistic values
        if 'confidence' in response_json:
            conf = response_json.get('confidence', 0)
            if isinstance(conf, (int, float)) and 0 <= conf <= 100:
                structured_reward += 0.02

        # Reward for reasonable price levels
        for price_field in ['stop_loss', 'take_profit']:
            price = response_json.get(price_field, 0)
            if isinstance(price, (int, float)) and price > 0:
                structured_reward += 0.01

    # Fallback: Basic text similarity if no structured output
    else:
        if ground_truth:
            response_tokens = set(response_lower.split())
            gt_tokens = set(ground_truth.lower().split())
            if len(gt_tokens) > 0:
                jaccard_similarity = len(response_tokens & gt_tokens) / len(response_tokens | gt_tokens)
                structured_reward += jaccard_similarity * 0.15

    total_reward += structured_reward * 0.25

    # 6. AI Reward Model Assessment (5% weight)
    if reward_model is not None and reward_tokenizer is not None:
        try:
            # Format input for conversational quality assessment
            model_input = f"Bitcoin Prediction Analysis: {response[:400]}"

            # A single sequence needs no padding. Requesting it makes
            # tokenizers without a pad token raise, which silently disabled
            # this whole component.
            inputs = reward_tokenizer(
                model_input,
                return_tensors="pt",
                truncation=True,
                max_length=512,
            )

            if torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}
                reward_model = reward_model.cuda()

            with torch.no_grad():
                outputs = reward_model(**inputs)

                if hasattr(outputs, 'logits'):
                    # Squash the mean logit into (0, 1)
                    ai_reward = torch.sigmoid(outputs.logits.mean()).item()
                else:
                    # Outputs without logits get a neutral constant
                    ai_reward = 0.5

                total_reward += ai_reward * 0.05

        except Exception as e:
            # Best-effort component: keep scoring, but report why it was
            # skipped instead of hiding the failure.
            warnings.warn(f"AI reward model scoring skipped: {e}")

    # 7. Quality and Structure Bonuses
    structure_bonus = 0.0

    # Well-structured analysis
    if any(marker in response for marker in ['1.', '2.', '3.', '•', '-', 'Analysis:', 'Prediction:', 'Factors:']):
        structure_bonus += 0.06

    # Risk disclaimers (good practice for predictions)
    disclaimer_terms = ['not financial advice', 'high risk', 'volatile', 'dyor', 'past performance']
    if any(term in response_lower for term in disclaimer_terms):
        structure_bonus += 0.04

    total_reward += structure_bonus

    # 8. Penalties for Poor Predictions
    penalties = 0.0

    # Penalty for overconfident predictions without reasoning
    overconfident_terms = ['guaranteed', 'definitely will', 'certain', '100%', 'no doubt']
    confidence_penalty = sum(0.03 for term in overconfident_terms if term in response_lower)
    penalties -= min(0.15, confidence_penalty)

    # Penalty for vague predictions
    vague_terms = ['maybe', 'might', 'could be', 'possibly', 'perhaps']
    vague_count = sum(1 for term in vague_terms if term in response_lower)
    if vague_count > 2:  # Too many hedge words
        penalties -= 0.05

    total_reward += penalties

    # Ensure reward is in reasonable range [0, 1]
    total_reward = max(0.0, min(1.0, total_reward))

    return total_reward

print("✅ Enhanced comprehensive prediction reward function created")

## Setup GRPO Training

In [None]:
# Training arguments
# GRPOTrainer is configured through GRPOConfig, a TrainingArguments subclass
# that also carries the GRPO-specific settings: the prompt and completion
# token budgets and the KL penalty weight.
training_args = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=100,
    save_strategy="steps",
    # No evaluation set is used. The deprecated `evaluation_strategy="no"`
    # keyword is dropped because "no" is already the default.
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    remove_unused_columns=False,  # extra dataset columns are forwarded to the reward functions
    dataloader_num_workers=2,
    seed=SEED,
    report_to="none",
    # GRPO-specific settings
    max_prompt_length=MAX_PROMPT_LENGTH,
    max_completion_length=MAX_LENGTH - MAX_PROMPT_LENGTH,  # rest of the total budget
    beta=BETA,
)

print("🎯 Training Configuration:")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {NUM_TRAIN_EPOCHS}")
print(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
print(f"Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"Max length: {MAX_LENGTH}")
print(f"Max prompt length: {MAX_PROMPT_LENGTH}")
print(f"Beta (KL penalty): {BETA}")
print(f"Using {'bfloat16' if is_bfloat16_supported() else 'float16'}")

## Initialize GRPO Trainer

In [None]:
# Initialize GRPO trainer
def _to_grpo_example(example):
    """Split one formatted conversation into a GRPO prompt and its reference answer."""
    messages = example["conversations"]
    return {
        # GRPO generates the assistant turn itself, so the prompt stops before it.
        "prompt": messages[:-1],
        # The reference answer is kept as an extra column for the reward function.
        "ground_truth": messages[-1]["content"],
    }


def bitcoin_prediction_reward(prompts, completions, ground_truth=None, **kwargs):
    """
    Reward function in the form GRPOTrainer calls: one score per completion.

    Args:
        prompts: Prompts of the batch (not used for scoring).
        completions: Generated completions; each is a list of messages for a
            conversational dataset, or a plain string otherwise.
        ground_truth: Reference answers taken from the dataset column of the
            same name, aligned with ``completions``.
        **kwargs: Any further columns or trainer extras, ignored here.

    Returns:
        list[float]: Scores from calculate_comprehensive_prediction_reward.
    """
    references = ground_truth if ground_truth is not None else [None] * len(completions)
    scores = []
    for completion, reference in zip(completions, references):
        text = completion[0]["content"] if isinstance(completion, list) else completion
        scores.append(
            float(
                calculate_comprehensive_prediction_reward(
                    text, reference, reward_model, reward_tokenizer
                )
            )
        )
    return scores


# GRPOTrainer reads prompts from a "prompt" column.
grpo_dataset = formatted_dataset.map(
    _to_grpo_example,
    remove_columns=formatted_dataset.column_names,
)

# GRPOTrainer needs a GRPOConfig. If the arguments object is a plain
# TrainingArguments, rebuild it from the fields set in the training
# configuration cell and add the GRPO-specific settings.
if isinstance(training_args, GRPOConfig):
    grpo_args = training_args
else:
    _shared_fields = (
        "output_dir", "num_train_epochs", "per_device_train_batch_size",
        "gradient_accumulation_steps", "learning_rate", "logging_steps",
        "save_steps", "save_strategy", "warmup_ratio", "lr_scheduler_type",
        "optim", "weight_decay", "max_grad_norm", "fp16", "bf16",
        "remove_unused_columns", "dataloader_num_workers", "seed", "report_to",
    )
    grpo_args = GRPOConfig(
        **{field: getattr(training_args, field) for field in _shared_fields},
        max_prompt_length=MAX_PROMPT_LENGTH,
        max_completion_length=MAX_LENGTH - MAX_PROMPT_LENGTH,
        beta=BETA,
    )

# Length limits and beta live on the config; the tokenizer is passed as
# `processing_class`; the reward function is mandatory.
grpo_trainer = GRPOTrainer(
    model=model,
    reward_funcs=[bitcoin_prediction_reward],
    args=grpo_args,
    train_dataset=grpo_dataset,
    processing_class=tokenizer,
)

print("✅ Unsloth GRPO Trainer initialized successfully!")
print(f"Model class: {model.__class__.__name__}")
print(f"Trainer class: {grpo_trainer.__class__.__name__}")
print(f"Training samples: {len(formatted_dataset):,}")
print(f"Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

## Start GRPO Training

In [None]:
# Run GRPO training and time it with wall-clock timestamps
banner = "=" * 60
timestamp_format = "%Y-%m-%d %H:%M:%S"

print("🚀 Starting Unsloth GRPO Training...")
print(f"Training {NUM_TRAIN_EPOCHS} epoch(s) on {len(formatted_dataset):,} samples")
print(banner)

# Timestamp taken immediately before training starts
start_time = datetime.now()
print(f"Training started at: {start_time.strftime(timestamp_format)}")

# The returned statistics expose the final loss and the step count
trainer_stats = grpo_trainer.train()

# Timestamp taken immediately after training returns
end_time = datetime.now()
training_duration = end_time - start_time

print(
    "\n".join(
        [
            "\n" + banner,
            "🎉 GRPO Training Completed!",
            f"Training finished at: {end_time.strftime(timestamp_format)}",
            f"Total training time: {training_duration}",
            f"Final training loss: {trainer_stats.training_loss:.4f}",
            f"Training steps: {trainer_stats.global_step}",
        ]
    )
)

## Save Model

In [None]:
# Save the final model
print("💾 Saving trained model...")

final_model_dir = f"{OUTPUT_DIR}/final_model"
summary_path = f"{OUTPUT_DIR}/training_summary.json"

# Save model and tokenizer
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"✅ Model saved to: {final_model_dir}")

# Save training summary
training_summary = {
    # The configuration cell defines BASE_MODEL_NAME. The name used here
    # before was undefined, so this cell raised NameError after saving the
    # model and the summary file was never written.
    "model_name": BASE_MODEL_NAME,
    "dataset": DATASET_NAME,
    "training_method": "Unsloth GRPO",
    "total_samples": len(formatted_dataset),
    "training_config": {
        "epochs": NUM_TRAIN_EPOCHS,
        "learning_rate": LEARNING_RATE,
        "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "max_length": MAX_LENGTH,
        "max_prompt_length": MAX_PROMPT_LENGTH,
        "beta": BETA,
    },
    "training_results": {
        "final_loss": trainer_stats.training_loss,
        "total_steps": trainer_stats.global_step,
        "training_duration": str(training_duration),  # timedelta is not JSON-serialisable
    },
    "timestamps": {
        "start_time": start_time.isoformat(),
        "end_time": end_time.isoformat(),
    },
    "model_path": final_model_dir,
}

# Save summary (explicit encoding so the file does not depend on the platform default)
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(training_summary, f, indent=2)

print(f"Training summary saved to: {summary_path}")

## Test Trained Model

In [None]:
# Smoke-test the trained model with a single chat prompt
print("🧪 Testing the trained GRPO model...")

# Switch the model to Unsloth's inference mode
FastLanguageModel.for_inference(model)

# One system turn and one user turn; the model supplies the assistant turn
test_messages = [
    {"role": "system", "content": "You are an expert Bitcoin market analyst. Provide accurate and insightful analysis."},
    {"role": "user", "content": "Based on recent market trends and news, what is your Bitcoin price prediction for the next week? Please provide detailed analysis."}
]

# Render the messages as text, ending with the generation prompt
test_prompt = tokenizer.apply_chat_template(
    test_messages, tokenize=False, add_generation_prompt=True
)

print("Test prompt:")
print(test_prompt)
print("\n" + "=" * 50)

# Tokenize on the model's device and sample a completion
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
generation_kwargs = dict(
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad():
    outputs = model.generate(**inputs, **generation_kwargs)

# The output sequence starts with the prompt tokens; decode only what follows
prompt_token_count = len(inputs.input_ids[0])
response = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
)

print("Model Response:")
print(response)
print("\n✅ Model testing completed!")

## Training Summary

In [None]:
# Final report. It reads the configuration constants and the results left by
# the training cell (trainer_stats, training_duration).
print("📊 Unsloth GRPO Training Summary")
print("=" * 50)
# The configuration cell defines BASE_MODEL_NAME; the name used here before was
# undefined and raised NameError.
print(f"🤖 Model: {BASE_MODEL_NAME}")
print(f"📚 Dataset: {DATASET_NAME}")
print("📈 Training method: Unsloth GRPO (Group Relative Policy Optimization)")
print(f"📝 Total samples: {len(formatted_dataset):,}")
print()
print("🎯 Training Configuration:")
print(f"  • Epochs: {NUM_TRAIN_EPOCHS}")
print(f"  • Learning rate: {LEARNING_RATE}")
print(f"  • Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
print(f"  • Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  • Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"  • Max length: {MAX_LENGTH}")
print(f"  • Max prompt length: {MAX_PROMPT_LENGTH}")
print(f"  • Beta (KL penalty): {BETA}")
print(f"  • LoRA rank: {LORA_R}")
print()
print("📊 Training Results:")
print(f"  • Final loss: {trainer_stats.training_loss:.4f}")
print(f"  • Training steps: {trainer_stats.global_step:,}")
print(f"  • Training duration: {training_duration}")
print()
print("💾 Outputs:")
print(f"  • Model saved to: {OUTPUT_DIR}/final_model")
print(f"  • Summary saved to: {OUTPUT_DIR}/training_summary.json")
print()
print("🔬 Key Features:")
print("  ✅ Unsloth-optimized GRPO training")
print("  ✅ Memory-efficient 4-bit quantization")
print("  ✅ LoRA parameter-efficient fine-tuning")
print("  ✅ Preference learning for Bitcoin analysis")
print("  ✅ Chat template formatting")
print("  ✅ Gradient checkpointing for memory optimization")
print()
print("🎉 Unsloth GRPO training completed successfully!")
print("📈 Model ready for Bitcoin prediction tasks!")