# Unsloth GRPO Training for Bitcoin Investment Advisory

This notebook implements Group Relative Policy Optimization (GRPO) using Unsloth for Bitcoin investment advisory.

**Dataset**: `tahamajs/bitcoin-investment-advisory-dataset` (Hugging Face Hub)

**Training Method**: Unsloth GRPO
- Reward-driven policy optimization: several completions are sampled per prompt and scored relative to one another
- Efficient memory usage with Unsloth
- Streamlined training pipeline for investment advisory

## Install Libraries

In [None]:
# !pip install -U "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install -U xformers trl peft accelerate bitsandbytes

## Imports

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
import torch, random, os
import json
import numpy as np
from datetime import datetime

# One seed shared by every random number generator used in this notebook.
SEED = 42

# Seed Python's, PyTorch's and NumPy's generators from that same value.
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

## Configuration

In [None]:
# Model configuration
BASE_MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"  # Base model passed to FastLanguageModel.from_pretrained
ADAPTER_PATH = "tahamajs/my-awesome-model_final_for_explanation_end_summerized"  # Pre-trained adapter location
CHECKPOINT = "checkpoint-800"  # Checkpoint name; joined to ADAPTER_PATH as "<ADAPTER_PATH>/<CHECKPOINT>"
MAX_SEQ_LENGTH = 2048  # Passed as max_seq_length when the base model is loaded
DTYPE = None  # Auto-detection
LOAD_IN_4BIT = True  # Passed as load_in_4bit when the base model is loaded

# LoRA configuration (forwarded to FastLanguageModel.get_peft_model)
LORA_R = 16  # LoRA rank (r)
LORA_ALPHA = 16  # lora_alpha
LORA_DROPOUT = 0.0  # lora_dropout
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # target_modules

# GRPO configuration
OUTPUT_DIR = "./qwen_bitcoin_advisory_grpo_unsloth_pretrained"  # Checkpoints, final model and summary JSON go here
LEARNING_RATE = 3e-7  # Lower for pre-trained model
NUM_TRAIN_EPOCHS = 1
PER_DEVICE_TRAIN_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8  # Effective batch size is reported as batch size * this value
MAX_LENGTH = 1024  # Reported as "Max length" in the training configuration printout
MAX_PROMPT_LENGTH = 512  # Reported as "Max prompt length"
BETA = 0.1  # Reported as "Beta (KL penalty)"

# Dataset
DATASET_NAME = "tahamajs/bitcoin-investment-advisory-dataset"  # Loaded with load_dataset(..., split="train")

# Reward model for financial advisory
REWARD_MODEL_NAME = "ProsusAI/finbert"  # Specialized for financial text analysis

## Load Model and Tokenizer

In [None]:
# Load base model
# Loads the base checkpoint and its tokenizer through Unsloth, using the
# sequence length, dtype and 4-bit settings from the configuration cell.
print(f"🔄 Loading base model: {BASE_MODEL_NAME}")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)

# Load pre-trained adapter
# Best effort: any failure is printed and training continues on the base model.
# NOTE(review): adapter_path is "<ADAPTER_PATH>/<CHECKPOINT>". If ADAPTER_PATH
# is a Hub repo id rather than a local directory, the checkpoint folder may
# need to be passed via `subfolder=` instead of being joined into the id —
# confirm the adapter actually loads and this does not silently fall back.
print(f"🔄 Loading pre-trained adapter: {ADAPTER_PATH}/{CHECKPOINT}")
try:
    adapter_path = f"{ADAPTER_PATH}/{CHECKPOINT}"
    model = PeftModel.from_pretrained(model, adapter_path)
    print(f"✅ Successfully loaded adapter from {adapter_path}")
except Exception as e:
    print(f"⚠️ Could not load adapter, using base model: {e}")

# Apply chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports Qwen models
)

# Prepare model for PEFT training
# NOTE(review): when the adapter above loaded successfully, `model` is already
# a PeftModel at this point — confirm that calling get_peft_model on it is
# supported and yields trainable LoRA weights.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    target_modules=TARGET_MODULES,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False,
    loftq_config=None,
)

# Load reward model for financial advisory
# Best effort: on failure both reward_model and reward_tokenizer are set to
# None, which the summary below reports as 'Rule-based only'. No device
# placement happens in this cell.
print(f"\n🔄 Loading reward model: {REWARD_MODEL_NAME}")
try:
    reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_NAME)
    reward_model = AutoModelForSequenceClassification.from_pretrained(REWARD_MODEL_NAME)
    reward_model.eval()
    print(f"✅ Reward model loaded successfully")
except Exception as e:
    print(f"⚠️ Could not load reward model, using rule-based rewards: {e}")
    reward_model = None
    reward_tokenizer = None

# Recap of what was loaded
print(f"\n📊 Model Configuration:")
print(f"  Base model: {BASE_MODEL_NAME}")
print(f"  Pre-trained adapter: {ADAPTER_PATH}/{CHECKPOINT}")
print(f"  Max sequence length: {MAX_SEQ_LENGTH}")
print(f"  LoRA rank: {LORA_R}")
print(f"  Load in 4bit: {LOAD_IN_4BIT}")
print(f"  Reward model: {REWARD_MODEL_NAME if reward_model else 'Rule-based only'}")

## Load and Prepare Dataset

In [None]:
# Fetch the training split of the advisory dataset
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Dataset loaded: {DATASET_NAME}")
print(f"Total samples: {len(dataset):,}")

# Preview the first record, cutting every field off at 150 characters
print("\n=== Sample Data ===")
sample = dataset[0]
for key, value in sample.items():
    field_text = str(value)
    tail = '...' if len(field_text) > 150 else ''
    print(f"{key}: {field_text[:150]}{tail}")

## Format Dataset for GRPO

In [None]:
def formatting_prompts_func(examples):
    """
    Convert a batch of raw rows into system/user/assistant conversations.

    Args:
        examples: Mapping of column name -> list of values for one batch (the
            shape a batched ``Dataset.map`` call passes in). The columns read
            are "instruction", "input" and "output"; any of them may be
            missing or contain None/empty values.

    Returns:
        dict: ``{"conversations": [...]}`` with one three-message conversation
        per row. An empty instruction falls back to a default system prompt;
        an empty input or output becomes "".
    """
    # Local import so the function does not depend on other notebook cells.
    from itertools import zip_longest

    default_system = "You are a professional Bitcoin investment advisor."

    # A missing column becomes an empty list; zip_longest then pads it with
    # None so the rows of the columns that ARE present are not silently
    # dropped (plain zip would truncate the whole batch to zero rows).
    columns = [examples.get(name) or [] for name in ("instruction", "input", "output")]

    conversations = [
        [
            {"role": "system", "content": instruction or default_system},
            {"role": "user", "content": user_input or ""},
            {"role": "assistant", "content": output or ""},
        ]
        for instruction, user_input, output in zip_longest(*columns)
    ]

    return {"conversations": conversations}

# Run the formatter batch-wise and keep only the column it produces
formatted_dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=dataset.column_names
)

print(f"Formatted dataset: {len(formatted_dataset):,} samples")

# Preview the first conversation, cutting each message off at 200 characters
print("\n=== Formatted Sample ===")
sample_conv = formatted_dataset[0]["conversations"]
for msg in sample_conv:
    role_label = msg['role'].upper()
    content = msg['content']
    tail = '...' if len(content) > 200 else ''
    print(f"**{role_label}**: {content[:200]}{tail}")
    print()

In [None]:
# Enhanced Reward Function for Bitcoin Investment Advisory
def calculate_investment_advisory_reward(response, ground_truth, reward_model=None, reward_tokenizer=None):
    """
    Score one generated advisory response with a blend of heuristics.

    The result is the sum of the weighted components below plus penalties,
    clamped to [0, 1]:
      1. rule-based investment quality      (multiplied by 0.40)
      2. professional tone and balance      (multiplied by 0.20)
      3. overlap with the reference answer  (multiplied by 0.25)
      4. optional classifier sentiment      (multiplied by 0.15)
      5. penalties for hype wording and for long answers without a disclaimer

    Args:
        response: Generated text to score. None is treated as "".
        ground_truth: Reference answer used for the similarity component.
            None is treated as "" (the similarity component is then 0).
        reward_model: Optional sequence-classification model. Component 4 is
            skipped when this or ``reward_tokenizer`` is None.
        reward_tokenizer: Tokenizer that matches ``reward_model``.

    Returns:
        float: reward in the closed range [0.0, 1.0].
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # Tolerate missing text so a single bad sample cannot abort a training step.
    response = response or ""
    ground_truth = ground_truth or ""
    response_lower = response.lower()
    ground_truth_lower = ground_truth.lower()

    total_reward = 0.0

    # 1. Rule-based Investment Quality Assessment (x0.4)
    investment_reward = 0.0

    # Length appropriateness (character count): detailed but not too verbose.
    response_len = len(response)
    if 100 <= response_len <= 800:
        investment_reward += 0.15
    elif 50 <= response_len <= 1200:
        investment_reward += 0.10
    elif response_len < 50:
        investment_reward -= 0.10  # Too short for investment advice

    # Investment-specific keywords; each one counts once (substring match).
    investment_keywords = {
        'strategy': 0.08, 'portfolio': 0.08, 'risk': 0.10, 'diversification': 0.05,
        'bitcoin': 0.05, 'cryptocurrency': 0.05, 'investment': 0.08, 'allocation': 0.06,
        'market': 0.05, 'analysis': 0.06, 'recommendation': 0.08, 'advice': 0.05,
        'hodl': 0.04, 'dca': 0.06, 'volatility': 0.05, 'return': 0.05
    }
    for keyword, weight in investment_keywords.items():
        if keyword in response_lower:
            investment_reward += weight

    # Risk management indicators, capped at 0.20 in total.
    risk_indicators = ['risk tolerance', 'risk management', 'stop loss', 'risk assessment',
                       'conservative', 'aggressive', 'moderate risk', 'diversify', 'hedge']
    risk_score = sum(0.05 for indicator in risk_indicators if indicator in response_lower)
    investment_reward += min(0.20, risk_score)

    # Concrete figures: at least one percentage or dollar amount.
    if re.search(r'\d+\.?\d*%', response) or re.search(r'\$\d+', response):
        investment_reward += 0.10

    # Investment timeframe mentions
    timeframes = ['short-term', 'long-term', 'monthly', 'yearly', 'quarterly', 'daily',
                  'weeks', 'months', 'years']
    if any(timeframe in response_lower for timeframe in timeframes):
        investment_reward += 0.08

    # Structured advice format. Note that '-' also matches hyphenated words.
    if any(marker in response for marker in ['1.', '2.', '•', '-', 'Step']):
        investment_reward += 0.10

    total_reward += investment_reward * 0.4

    # 2. Sentiment and Tone Analysis (x0.2)
    sentiment_reward = 0.0

    # Professional tone indicators, capped at 0.15.
    professional_phrases = ['recommend', 'suggest', 'consider', 'analysis shows', 'based on',
                            'in my opinion', 'professional advice', 'financial guidance']
    professional_score = sum(0.03 for phrase in professional_phrases if phrase in response_lower)
    sentiment_reward += min(0.15, professional_score)

    # Balanced perspective indicators, capped at 0.10.
    balance_indicators = ['however', 'on the other hand', 'alternatively', 'but', 'although',
                          'pros and cons', 'advantages', 'disadvantages', 'benefits', 'risks']
    balance_score = sum(0.02 for indicator in balance_indicators if indicator in response_lower)
    sentiment_reward += min(0.10, balance_score)

    total_reward += sentiment_reward * 0.2

    # 3. Content Similarity with Ground Truth (x0.25)
    similarity_reward = 0.0

    # Whitespace-token overlap (Jaccard) between response and reference.
    response_tokens = set(response_lower.split())
    gt_tokens = set(ground_truth_lower.split())

    if gt_tokens:
        # The union is non-empty here because gt_tokens is non-empty.
        jaccard_similarity = len(response_tokens & gt_tokens) / len(response_tokens | gt_tokens)
        similarity_reward += jaccard_similarity * 0.6

        # Key concept alignment: same action word in both texts, capped at 0.4.
        key_concepts = ['buy', 'sell', 'hold', 'accumulate', 'wait', 'caution', 'bullish', 'bearish']
        concept_matches = sum(1 for concept in key_concepts
                              if concept in response_lower and concept in ground_truth_lower)
        if concept_matches > 0:
            similarity_reward += min(0.4, concept_matches * 0.1)

    total_reward += similarity_reward * 0.25

    # 4. AI Reward Model Assessment (x0.15) -- best effort, never fatal
    if reward_model is not None and reward_tokenizer is not None:
        try:
            model_input = f"Investment Advice: {response[:400]}"  # Limit length for model

            inputs = reward_tokenizer(
                model_input,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )

            if torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}
                reward_model = reward_model.cuda()

            with torch.no_grad():
                logits = reward_model(**inputs).logits
            probabilities = torch.softmax(logits, dim=-1)

            # Resolve class positions from the model's own label map instead of
            # assuming an order: ProsusAI/finbert, for example, is published
            # with positive/negative/neutral at indices 0/1/2.
            id2label = getattr(getattr(reward_model, "config", None), "id2label", None) or {}
            class_index = {str(label).lower(): int(index) for index, label in id2label.items()}

            if "positive" in class_index and "neutral" in class_index:
                positive_score = probabilities[0, class_index["positive"]].item()
                neutral_score = probabilities[0, class_index["neutral"]].item()
                # Weight positive sentiment higher for investment advice
                ai_reward = positive_score * 0.8 + neutral_score * 0.2
            else:
                # No sentiment labels available: fall back to a generic score.
                ai_reward = torch.sigmoid(logits).mean().item()

            total_reward += ai_reward * 0.15

        except Exception as e:
            # Fallback: keep the rule-based components only.
            print(f"⚠️ Reward model error: {e}")

    # 5. Penalties for poor investment advice (at most 0.35 in total)
    penalties = 0.0

    # Overly speculative language, capped at 0.3.
    speculative_words = ['guaranteed', 'definitely will', 'certain profit', 'no risk', 'sure thing']
    speculation_penalty = sum(0.1 for word in speculative_words if word in response_lower)
    penalties -= min(0.3, speculation_penalty)

    # Longer answers (> 200 characters) are expected to carry a disclaimer.
    disclaimer_phrases = ['not financial advice', 'dyor', 'do your own research', 'consult', 'disclaimer']
    if not any(phrase in response_lower for phrase in disclaimer_phrases) and len(response) > 200:
        penalties -= 0.05

    total_reward += penalties

    # Clamp the final reward to [0, 1]
    return max(0.0, min(1.0, total_reward))

print("✅ Enhanced investment advisory reward function created")

In [None]:
# Custom GRPO Trainer with Enhanced Reward Integration
class CustomGRPOTrainer(GRPOTrainer):
    """
    GRPO trainer subclass that carries the advisory reward model and offers
    helpers for sampling completions and scoring them.

    NOTE(review): `compute_rewards` and `generate_completions` are helper
    methods defined on this subclass. Whether the installed TRL `GRPOTrainer`
    ever calls methods with these names was not verified — confirm before
    relying on them as training hooks.
    """

    def __init__(self, reward_model=None, reward_tokenizer=None, **kwargs):
        """
        Args:
            reward_model: Optional classifier forwarded to the reward function.
            reward_tokenizer: Tokenizer that matches ``reward_model``.
            **kwargs: Passed unchanged to ``GRPOTrainer.__init__``.
        """
        super().__init__(**kwargs)
        self.reward_model = reward_model
        self.reward_tokenizer = reward_tokenizer

    def compute_rewards(self, queries, responses, ground_truths=None):
        """
        Score every sampled response with the advisory reward function.

        Args:
            queries: Prompts the responses were generated for (not used in
                scoring; kept for interface symmetry).
            responses: One list of response strings per query.
            ground_truths: Optional reference answers aligned with ``responses``.
                Missing or None entries are scored against "".

        Returns:
            list[list[float]]: rewards with the same nesting as ``responses``.
        """
        rewards = []

        for index, query_responses in enumerate(responses):
            has_reference = bool(ground_truths) and index < len(ground_truths)
            # A None reference is normalised to "" so the reward function
            # always receives a string.
            ground_truth = (ground_truths[index] if has_reference else "") or ""

            rewards.append([
                calculate_investment_advisory_reward(
                    response=response,
                    ground_truth=ground_truth,
                    reward_model=self.reward_model,
                    reward_tokenizer=self.reward_tokenizer
                )
                for response in query_responses
            ])

        return rewards

    def generate_completions(self, queries, num_completions=4):
        """
        Sample ``num_completions`` responses per query from the policy model.

        Reads ``self.tokenizer``, ``self.model``, ``self.max_prompt_length`` and
        ``self.max_length``; none of these are set by this subclass, so they
        must be provided by the base trainer or assigned by the caller.

        Args:
            queries: Prompt strings.
            num_completions: Number of samples to draw for each prompt.

        Returns:
            list[list[str]]: decoded completions, one inner list per query.
        """
        all_completions = []

        for query in queries:
            # Tokenize the query, truncated to the prompt budget
            inputs = self.tokenizer(
                query,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_prompt_length
            ).to(self.model.device)

            prompt_len = inputs['input_ids'].shape[1]
            # Clamp to at least one token: generate() rejects a non-positive
            # budget, which would occur if the prompt already fills max_length.
            new_token_budget = max(1, min(512, self.max_length - prompt_len))

            completions = []
            with torch.no_grad():
                for _ in range(num_completions):
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=new_token_budget,
                        temperature=0.8,
                        do_sample=True,
                        pad_token_id=self.tokenizer.eos_token_id,
                        eos_token_id=self.tokenizer.eos_token_id,
                        repetition_penalty=1.1,
                    )

                    # Keep only the newly generated tokens, then decode them
                    generated_ids = outputs[0][prompt_len:]
                    completions.append(
                        self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
                    )

            all_completions.append(completions)

        return all_completions

print("✅ Custom GRPO trainer with enhanced rewards created")

## Setup GRPO Training

In [None]:
# Training arguments
# GRPOTrainer is configured through GRPOConfig (a TrainingArguments subclass).
# The GRPO-specific settings -- prompt/completion token budgets and the KL
# coefficient -- are fields of this config object.
training_args = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=100,
    save_strategy="steps",
    # No evaluation is configured: the evaluation strategy is left at its
    # default ("no"), which also avoids the keyword that was renamed between
    # transformers releases (evaluation_strategy -> eval_strategy).
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    remove_unused_columns=False,
    dataloader_num_workers=2,
    seed=SEED,
    report_to="none",
    # GRPO-specific settings
    max_prompt_length=MAX_PROMPT_LENGTH,
    # MAX_LENGTH is the total budget (prompt + completion), so the completion
    # gets whatever remains after the prompt.
    max_completion_length=MAX_LENGTH - MAX_PROMPT_LENGTH,
    beta=BETA,
)

print(f"🎯 Training Configuration:")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {NUM_TRAIN_EPOCHS}")
print(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
print(f"Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"Max length: {MAX_LENGTH}")
print(f"Max prompt length: {MAX_PROMPT_LENGTH}")
print(f"Beta (KL penalty): {BETA}")
print(f"Using {'bfloat16' if is_bfloat16_supported() else 'float16'}")

## Initialize GRPO Trainer

In [None]:
# Initialize GRPO trainer
def investment_advisory_reward_func(prompts, completions, ground_truth=None, **kwargs):
    """
    Adapter between TRL's reward-function interface and the advisory reward.

    Args:
        prompts: Prompts for the sampled completions (unused by the scoring).
        completions: Sampled completions. For conversational prompts each item
            is a list of message dicts; for plain-text prompts it is a string.
        ground_truth: Reference answers taken from the dataset column of the
            same name, aligned with ``completions``. Optional.
        **kwargs: Any other values the trainer forwards; ignored.

    Returns:
        list[float]: one reward per completion.
    """
    if ground_truth is None:
        ground_truth = [""] * len(completions)

    rewards = []
    for completion, reference in zip(completions, ground_truth):
        # Conversational completions arrive as [{"role": ..., "content": ...}].
        text = completion[-1]["content"] if isinstance(completion, list) else completion
        rewards.append(
            calculate_investment_advisory_reward(
                response=text,
                ground_truth=reference or "",
                reward_model=reward_model,
                reward_tokenizer=reward_tokenizer,
            )
        )
    return rewards


def _to_grpo_example(example):
    """Split a conversation into the prompt messages and the reference answer."""
    messages = example["conversations"]
    # The last message is the assistant's reference answer; GRPO generates its
    # own completions, so only the preceding messages form the prompt.
    return {"prompt": messages[:-1], "ground_truth": messages[-1]["content"]}


# GRPOTrainer reads prompts from a "prompt" column; the remaining columns are
# forwarded to the reward function as keyword arguments.
grpo_dataset = formatted_dataset.map(
    _to_grpo_example,
    remove_columns=formatted_dataset.column_names,
)

# `args` must be a trl.GRPOConfig: the GRPO-specific settings (prompt and
# completion lengths, beta) are fields of that config object, not constructor
# arguments of the trainer.
grpo_trainer = GRPOTrainer(
    model=model,
    reward_funcs=[investment_advisory_reward_func],
    args=training_args,
    train_dataset=grpo_dataset,
    processing_class=tokenizer,
)

print("✅ Unsloth GRPO Trainer initialized successfully!")
print(f"Model class: {model.__class__.__name__}")
print(f"Trainer class: {grpo_trainer.__class__.__name__}")
print(f"Training samples: {len(formatted_dataset):,}")
print(f"Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

## Start GRPO Training

In [None]:
# Kick off the GRPO run and time it
rule = "=" * 60
timestamp_format = '%Y-%m-%d %H:%M:%S'

print("🚀 Starting Unsloth GRPO Training for Investment Advisory...")
print(f"Training {NUM_TRAIN_EPOCHS} epoch(s) on {len(formatted_dataset):,} samples")
print(rule)

# Wall-clock start
start_time = datetime.now()
print(f"Training started at: {start_time.strftime(timestamp_format)}")

# Run the optimisation loop; the returned stats carry loss and step count
trainer_stats = grpo_trainer.train()

# Wall-clock end and elapsed time
end_time = datetime.now()
training_duration = end_time - start_time

print("\n" + rule)
print("🎉 GRPO Training Completed!")
print(f"Training finished at: {end_time.strftime(timestamp_format)}")
print(f"Total training time: {training_duration}")
print(f"Final training loss: {trainer_stats.training_loss:.4f}")
print(f"Training steps: {trainer_stats.global_step}")

## Save Model

In [None]:
# Save the final model
print("💾 Saving trained model...")

# Model and tokenizer are written side by side so the directory can be
# reloaded on its own.
final_model_dir = f"{OUTPUT_DIR}/final_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"✅ Model saved to: {final_model_dir}")

# Save training summary
training_summary = {
    # The configuration cell defines BASE_MODEL_NAME; there is no MODEL_NAME.
    "model_name": BASE_MODEL_NAME,
    "dataset": DATASET_NAME,
    "training_method": "Unsloth GRPO for Investment Advisory",
    "total_samples": len(formatted_dataset),
    "training_config": {
        "epochs": NUM_TRAIN_EPOCHS,
        "learning_rate": LEARNING_RATE,
        "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "max_length": MAX_LENGTH,
        "max_prompt_length": MAX_PROMPT_LENGTH,
        "beta": BETA,
    },
    "training_results": {
        "final_loss": trainer_stats.training_loss,
        "total_steps": trainer_stats.global_step,
        # timedelta is not JSON-serialisable, so store its string form
        "training_duration": str(training_duration),
    },
    "timestamps": {
        "start_time": start_time.isoformat(),
        "end_time": end_time.isoformat(),
    },
    "model_path": final_model_dir,
}

# Save summary
summary_path = f"{OUTPUT_DIR}/training_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(training_summary, f, indent=2)

print(f"Training summary saved to: {summary_path}")

## Test Trained Model

In [None]:
# Smoke-test the trained model with a single advisory question
print("🧪 Testing the trained GRPO model...")

# Prepare model for inference
FastLanguageModel.for_inference(model)

# One system + one user message describing the advisory scenario
system_message = {"role": "system", "content": "You are a professional Bitcoin investment advisor. Provide strategic investment advice based on market analysis."}
user_message = {"role": "user", "content": "Given the current market conditions with Bitcoin at $45,000 and recent institutional adoption trends, what would be your investment recommendation for a moderate risk tolerance portfolio?"}
test_messages = [system_message, user_message]

# Render the messages through the chat template, ending with the generation prompt
test_prompt = tokenizer.apply_chat_template(
    test_messages,
    tokenize=False,
    add_generation_prompt=True
)

print("Test prompt:")
print(test_prompt)
print("\n" + "="*50)

# Tokenize on the model's device and remember where the prompt ends
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
prompt_token_count = len(inputs.input_ids[0])

sampling_options = dict(
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad():
    outputs = model.generate(**inputs, **sampling_options)

# Decode only the tokens generated after the prompt
response = tokenizer.decode(
    outputs[0][prompt_token_count:],
    skip_special_tokens=True
)

print("Model Response:")
print(response)
print("\n✅ Model testing completed!")

## Training Summary

In [None]:
# Final human-readable recap of the run (configuration, results, outputs)
print("📊 Unsloth GRPO Training Summary - Bitcoin Investment Advisory")
print("=" * 60)
# The configuration cell defines BASE_MODEL_NAME; there is no MODEL_NAME.
print(f"🤖 Model: {BASE_MODEL_NAME}")
print(f"📚 Dataset: {DATASET_NAME}")
print(f"📈 Training method: Unsloth GRPO for Investment Advisory")
print(f"📝 Total samples: {len(formatted_dataset):,}")
print()
print("🎯 Training Configuration:")
print(f"  • Epochs: {NUM_TRAIN_EPOCHS}")
print(f"  • Learning rate: {LEARNING_RATE}")
print(f"  • Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
print(f"  • Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  • Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"  • Max length: {MAX_LENGTH}")
print(f"  • Max prompt length: {MAX_PROMPT_LENGTH}")
print(f"  • Beta (KL penalty): {BETA}")
print(f"  • LoRA rank: {LORA_R}")
print()
print("📊 Training Results:")
print(f"  • Final loss: {trainer_stats.training_loss:.4f}")
print(f"  • Training steps: {trainer_stats.global_step:,}")
print(f"  • Training duration: {training_duration}")
print()
print("💾 Outputs:")
print(f"  • Model saved to: {OUTPUT_DIR}/final_model")
print(f"  • Summary saved to: {OUTPUT_DIR}/training_summary.json")
print()
print("🔬 Key Features for Investment Advisory:")
print("  ✅ Unsloth-optimized GRPO training")
print("  ✅ Bitcoin investment strategy generation")
print("  ✅ Risk assessment and management")
print("  ✅ Portfolio optimization advice")
print("  ✅ Market analysis and insights")
print("  ✅ Preference learning for advice quality")
print("  ✅ Memory-efficient training")
print()
print("🎉 Unsloth GRPO training completed successfully!")
print("💰 Model ready for Bitcoin investment advisory tasks!")