# Unsloth GRPO Training for Bitcoin Enhanced Prediction

This notebook implements Group Relative Policy Optimization (GRPO) using Unsloth for comprehensive Bitcoin prediction.

**Dataset**: `bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news`

**Training Method**: Unsloth GRPO
- Reinforcement learning from reward functions, with each completion scored relative to a group of completions sampled for the same prompt
- Efficient memory usage with Unsloth
- Streamlined training pipeline

## Install Libraries

In [None]:
# Optional setup: uncomment the lines below to install/upgrade the packages this notebook imports.
# NOTE(review): versions are unpinned — consider pinning them so reruns are reproducible.
# !pip install -U "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install -U xformers trl peft accelerate bitsandbytes

## Imports

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
from transformers import TrainingArguments
import torch, random, os
import json
import numpy as np
from datetime import datetime

# One seed shared by every random number generator the notebook uses
# (PyTorch, the stdlib `random` module, and NumPy's legacy global RNG).
SEED = 42
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(SEED)

## Configuration

In [None]:
# ---- Base model -------------------------------------------------------------
# Hub identifier of the checkpoint to load; a local checkpoint path can be used instead.
MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 2048
DTYPE = None  # None -> auto-detection
LOAD_IN_4BIT = True

# ---- LoRA adapter -----------------------------------------------------------
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0.0
TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# ---- GRPO training ----------------------------------------------------------
OUTPUT_DIR = "./qwen_bitcoin_grpo_unsloth"
LEARNING_RATE = 5e-7
NUM_TRAIN_EPOCHS = 1
PER_DEVICE_TRAIN_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
MAX_LENGTH = 1024
MAX_PROMPT_LENGTH = 512
BETA = 0.1  # reported as the KL penalty in the training printouts

# ---- Data -------------------------------------------------------------------
DATASET_NAME = "tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news"

## Load Model and Tokenizer

In [None]:
# Fetch the base checkpoint together with its tokenizer, using the
# settings from the configuration cell.
_load_options = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)

# Switch the tokenizer to the "chatml" chat template (supports Qwen models).
tokenizer = get_chat_template(tokenizer, chat_template="chatml")

# Wrap the model for PEFT training with the LoRA settings from the configuration cell.
_lora_options = dict(
    r=LORA_R,
    target_modules=TARGET_MODULES,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False,
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(model, **_lora_options)

# Echo the key settings so the cell output records what was loaded.
for _label, _value in (
    ("Model", MODEL_NAME),
    ("Max sequence length", MAX_SEQ_LENGTH),
    ("LoRA rank", LORA_R),
    ("Load in 4bit", LOAD_IN_4BIT),
):
    print(f"{_label}: {_value}")

## Load and Prepare Dataset

In [None]:
# Pull the training split from the Hub and report its size.
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Dataset loaded: {DATASET_NAME}")
print(f"Total samples: {len(dataset):,}")

# Preview the first record, clipping every field to 150 characters.
print("\n=== Sample Data ===")
sample = dataset[0]
for key, value in sample.items():
    text = str(value)
    suffix = '...' if len(text) > 150 else ''
    print(f"{key}: {text[:150]}{suffix}")

## Format Dataset for GRPO

In [None]:
def formatting_prompts_func(examples):
    """
    Format a batch of examples for Unsloth GRPO training.

    Builds one conversation (system, user and assistant messages) per row from
    the "instruction", "input" and "output" columns.

    Args:
        examples: Mapping of column name -> list of values for one batch, as
            passed by `Dataset.map(..., batched=True)`. Any of the three
            columns may be absent.

    Returns:
        dict with the single key "conversations": a list holding one
        three-message conversation per row. Empty or missing values fall back
        to a default system prompt / empty strings.
    """
    columns = {name: examples.get(name) for name in ("instruction", "input", "output")}

    # Size the batch from whichever columns are present. Zipping against an
    # empty default (as happens when "input" or "output" is absent) would
    # silently drop every row of the batch.
    batch_size = max(
        (len(values) for values in columns.values() if values is not None),
        default=0,
    )
    instructions, inputs, outputs = (
        values if values is not None else [None] * batch_size
        for values in columns.values()
    )

    conversations = [
        [
            {"role": "system", "content": instruction or "You are a helpful Bitcoin market analyst."},
            {"role": "user", "content": user_input or ""},
            {"role": "assistant", "content": output or ""},
        ]
        for instruction, user_input, output in zip(instructions, inputs, outputs)
    ]

    return {"conversations": conversations}

# Run the formatter batch-wise; the raw columns are removed so that only the
# formatter's output column is kept.
formatted_dataset = dataset.map(
    formatting_prompts_func,
    remove_columns=dataset.column_names,
    batched=True,
)

print(f"Formatted dataset: {len(formatted_dataset):,} samples")

# Preview the first conversation, clipping each message to 200 characters.
print("\n=== Formatted Sample ===")
sample_conv = formatted_dataset[0]["conversations"]
for msg in sample_conv:
    role, text = msg['role'], msg['content']
    suffix = '...' if len(text) > 200 else ''
    print(f"**{role.upper()}**: {text[:200]}{suffix}")
    print()

## Setup GRPO Training

In [None]:
# Training arguments.
# GRPOTrainer reads its GRPO-specific settings (prompt/completion token budgets,
# group size, KL coefficient) from the config object, so this has to be a
# GRPOConfig rather than a plain transformers.TrainingArguments.

# Completions sampled per prompt (the "group" in GRPO).
# NOTE(review): TRL requires the generation batch to be divisible by
# num_generations, and the exact rule differs between TRL releases — confirm
# against the installed version if the config raises on construction.
NUM_GENERATIONS = 8

# MAX_LENGTH is treated as the prompt + completion budget, so whatever is left
# after the prompt is the room available for the generated completion.
MAX_COMPLETION_LENGTH = MAX_LENGTH - MAX_PROMPT_LENGTH
assert MAX_COMPLETION_LENGTH > 0, "MAX_LENGTH must be larger than MAX_PROMPT_LENGTH"

training_args = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=100,
    save_strategy="steps",
    # No evaluation set is used. The former `evaluation_strategy="no"` argument
    # is omitted on purpose: "no" is already the default, and the keyword was
    # renamed to `eval_strategy` in newer transformers releases.
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    remove_unused_columns=False,  # extra dataset columns are forwarded to the reward functions
    dataloader_num_workers=2,
    seed=SEED,
    report_to="none",
    # GRPO-specific settings
    max_prompt_length=MAX_PROMPT_LENGTH,
    max_completion_length=MAX_COMPLETION_LENGTH,
    num_generations=NUM_GENERATIONS,
    beta=BETA,
)

print(f"🎯 Training Configuration:")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs: {NUM_TRAIN_EPOCHS}")
print(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
print(f"Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"Max length: {MAX_LENGTH}")
print(f"Max prompt length: {MAX_PROMPT_LENGTH}")
print(f"Max completion length: {MAX_COMPLETION_LENGTH}")
print(f"Generations per prompt: {NUM_GENERATIONS}")
print(f"Beta (KL penalty): {BETA}")
print(f"Using {'bfloat16' if is_bfloat16_supported() else 'float16'}")

## Initialize GRPO Trainer

In [None]:
# Initialize GRPO trainer.
#
# GRPO samples completions for each prompt and scores them with reward
# functions, so the trainer needs:
#   * a dataset with a "prompt" column (the assistant answer must NOT be part
#     of the prompt), and
#   * at least one reward function.
# Prompt/completion lengths and the KL coefficient are read from the
# GRPOConfig passed as `args`; they are not constructor arguments.


def _to_grpo_example(example):
    """Split one stored conversation into the prompt messages and the reference answer.

    The last message of each conversation is the assistant reply written by
    `formatting_prompts_func`; everything before it becomes the prompt.
    """
    *prompt_messages, reference = example["conversations"]
    return {"prompt": prompt_messages, "answer": reference["content"]}


def _completion_text(completion):
    """Return the generated text of one completion.

    Conversational prompts yield completions as a list of message dicts;
    plain-text prompts yield a string. Both are handled.
    """
    if isinstance(completion, str):
        return completion
    return completion[0]["content"] if completion else ""


def reference_overlap_reward(prompts, completions, answer, **kwargs):
    """Reward each completion by its word-level F1 overlap with the reference answer.

    Args:
        prompts: Prompts the completions were sampled for (unused).
        completions: Sampled completions, one per entry of `answer`.
        answer: Reference outputs taken from the dataset's "answer" column.
        **kwargs: Any other values the trainer forwards (ignored).

    Returns:
        list[float]: One score in [0, 1] per completion; 0.0 when either the
        completion or the reference is empty.

    NOTE(review): this is a generic baseline so training can run end to end.
    Replace it with a task-specific scorer once the format of the dataset's
    "output" field is confirmed.
    """
    rewards = []
    for completion, reference in zip(completions, answer):
        predicted = set(_completion_text(completion).lower().split())
        expected = set((reference or "").lower().split())
        shared = len(predicted & expected)
        if shared == 0:
            rewards.append(0.0)
            continue
        precision = shared / len(predicted)
        recall = shared / len(expected)
        rewards.append(2 * precision * recall / (precision + recall))
    return rewards


# Fail early with an actionable message instead of an attribute error deep
# inside the trainer.
if not isinstance(training_args, GRPOConfig):
    raise TypeError(
        "GRPOTrainer needs `training_args` to be a trl.GRPOConfig "
        f"(got {type(training_args).__name__}); build the training arguments with GRPOConfig."
    )

grpo_dataset = formatted_dataset.map(
    _to_grpo_example,
    remove_columns=formatted_dataset.column_names,
)

grpo_trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reference_overlap_reward],
    args=training_args,
    train_dataset=grpo_dataset,
    processing_class=tokenizer,
)

print("✅ Unsloth GRPO Trainer initialized successfully!")
print(f"Model class: {model.__class__.__name__}")
print(f"Trainer class: {grpo_trainer.__class__.__name__}")
print(f"Training samples: {len(formatted_dataset):,}")
print(f"Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

## Start GRPO Training

In [None]:
# Run training and time it with wall-clock timestamps.
_rule = "=" * 60
_stamp_format = '%Y-%m-%d %H:%M:%S'

print("🚀 Starting Unsloth GRPO Training...")
print(f"Training {NUM_TRAIN_EPOCHS} epoch(s) on {len(formatted_dataset):,} samples")
print(_rule)

# Timestamp taken immediately before the training call.
start_time = datetime.now()
print(f"Training started at: {start_time.strftime(_stamp_format)}")

# The returned stats object is reused by the save and summary cells.
trainer_stats = grpo_trainer.train()

# Timestamp taken immediately after; the difference is the reported duration.
end_time = datetime.now()
training_duration = end_time - start_time

print("\n" + _rule)
print("🎉 GRPO Training Completed!")
print(f"Training finished at: {end_time.strftime(_stamp_format)}")
print(f"Total training time: {training_duration}")
print(f"Final training loss: {trainer_stats.training_loss:.4f}")
print(f"Training steps: {trainer_stats.global_step}")

## Save Model

In [None]:
# Persist the trained model, its tokenizer, and a JSON record of the run.
print("💾 Saving trained model...")

_final_model_dir = f"{OUTPUT_DIR}/final_model"
_summary_path = f"{OUTPUT_DIR}/training_summary.json"

# Model and tokenizer are written to the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(_final_model_dir)

print(f"✅ Model saved to: {_final_model_dir}")

# Gather the run's settings, results and timestamps into one JSON-serialisable dict.
_config_section = {
    "epochs": NUM_TRAIN_EPOCHS,
    "learning_rate": LEARNING_RATE,
    "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
    "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
    "max_length": MAX_LENGTH,
    "max_prompt_length": MAX_PROMPT_LENGTH,
    "beta": BETA,
}
_results_section = {
    "final_loss": trainer_stats.training_loss,
    "total_steps": trainer_stats.global_step,
    "training_duration": str(training_duration),
}
_timestamps_section = {
    "start_time": start_time.isoformat(),
    "end_time": end_time.isoformat(),
}
training_summary = {
    "model_name": MODEL_NAME,
    "dataset": DATASET_NAME,
    "training_method": "Unsloth GRPO",
    "total_samples": len(formatted_dataset),
    "training_config": _config_section,
    "training_results": _results_section,
    "timestamps": _timestamps_section,
    "model_path": _final_model_dir,
}

# Write the summary next to the saved model.
with open(_summary_path, "w") as f:
    json.dump(training_summary, f, indent=2)

print(f"Training summary saved to: {_summary_path}")

## Test Trained Model

In [None]:
# Smoke-test the trained model on one hand-written prompt.
print("🧪 Testing the trained GRPO model...")

# Prepare model for inference
FastLanguageModel.for_inference(model)

# A single system + user exchange to generate a reply for.
test_messages = [
    {
        "role": "system",
        "content": "You are an expert Bitcoin market analyst. Provide accurate and insightful analysis.",
    },
    {
        "role": "user",
        "content": "Based on recent market trends and news, what is your Bitcoin price prediction for the next week? Please provide detailed analysis.",
    },
]

# Render the messages to text with the chat template, ending with the
# generation prompt so the model continues with its own reply.
test_prompt = tokenizer.apply_chat_template(
    test_messages,
    add_generation_prompt=True,
    tokenize=False,
)

print("Test prompt:")
print(test_prompt)
print("\n" + "="*50)

# Tokenize and move the tensors to the model's device.
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

# Sampling settings for the generation call.
_generation_options = dict(
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad():
    outputs = model.generate(**inputs, **_generation_options)

# The generated sequence starts with the prompt tokens; slice them off so
# only the newly generated part is decoded.
_prompt_token_count = len(inputs.input_ids[0])
response = tokenizer.decode(
    outputs[0][_prompt_token_count:],
    skip_special_tokens=True,
)

print("Model Response:")
print(response)
print("\n✅ Model testing completed!")

## Training Summary

In [None]:
# Assemble the end-of-run report line by line, then emit it in one print call.
_report_lines = [
    "📊 Unsloth GRPO Training Summary",
    "=" * 50,
    f"🤖 Model: {MODEL_NAME}",
    f"📚 Dataset: {DATASET_NAME}",
    f"📈 Training method: Unsloth GRPO (Group Relative Policy Optimization)",
    f"📝 Total samples: {len(formatted_dataset):,}",
    "",
    # Settings taken from the configuration cell.
    "🎯 Training Configuration:",
    f"  • Epochs: {NUM_TRAIN_EPOCHS}",
    f"  • Learning rate: {LEARNING_RATE}",
    f"  • Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}",
    f"  • Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}",
    f"  • Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}",
    f"  • Max length: {MAX_LENGTH}",
    f"  • Max prompt length: {MAX_PROMPT_LENGTH}",
    f"  • Beta (KL penalty): {BETA}",
    f"  • LoRA rank: {LORA_R}",
    "",
    # Values produced by the training cell.
    "📊 Training Results:",
    f"  • Final loss: {trainer_stats.training_loss:.4f}",
    f"  • Training steps: {trainer_stats.global_step:,}",
    f"  • Training duration: {training_duration}",
    "",
    # Locations written by the save cell.
    "💾 Outputs:",
    f"  • Model saved to: {OUTPUT_DIR}/final_model",
    f"  • Summary saved to: {OUTPUT_DIR}/training_summary.json",
    "",
    "🔬 Key Features:",
    "  ✅ Unsloth-optimized GRPO training",
    "  ✅ Memory-efficient 4-bit quantization",
    "  ✅ LoRA parameter-efficient fine-tuning",
    "  ✅ Preference learning for Bitcoin analysis",
    "  ✅ Chat template formatting",
    "  ✅ Gradient checkpointing for memory optimization",
    "",
    "🎉 Unsloth GRPO training completed successfully!",
    "📈 Model ready for Bitcoin prediction tasks!",
]
print("\n".join(_report_lines))