# GRPO Training for Bitcoin Enhanced Prediction (No Unsloth)

This notebook performs GRPO training using Hugging Face Transformers, TRL, PEFT, and bitsandbytes — without Unsloth. It loads a pre-trained LoRA adapter and optimizes it with a structured-output reward.

- Base model: Qwen2.5-7B-Instruct (loaded 4-bit NF4-quantized via bitsandbytes)
- Adapter: tahamajs/my-awesome-model_final_bitcoin_enhanced_prediction_dataset_with_local_comprehensive_news (checkpoint-1152)
- Dataset: tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news
- Reward: Structured JSON action/confidence/SL/TP/forecast closeness + analysis quality

In [None]:
# Optional installs (uncomment if needed)
# !pip install -U transformers accelerate trl peft bitsandbytes datasets safetensors
# !pip install -U einops sentencepiece

In [None]:
import os, json, random, re
from datetime import datetime
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import GRPOTrainer, GRPOConfig

# Seed every random number generator the notebook relies on so runs are repeatable.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
# The CUDA generators are only seeded when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


In [None]:
# Config
# Every tunable value used by the later cells lives here.
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"  # base checkpoint the tokenizer and model are loaded from
LOAD_IN_4BIT = True  # enables the bitsandbytes quantization config when loading the base model
ADAPTER_PATH = "tahamajs/my-awesome-model_final_bitcoin_enhanced_prediction_dataset_with_local_comprehensive_news"
CHECKPOINT = "checkpoint-1152"  # checkpoint sub-directory of ADAPTER_PATH to load
MAX_SEQ_LEN = 2048  # NOTE(review): not referenced by any visible cell — confirm it is still needed
MAX_LENGTH = 1024  # handed to the trainer as `max_length`
MAX_PROMPT_LENGTH = 512  # handed to the trainer as `max_prompt_length`
OUTPUT_DIR = "./qwen_bitcoin_enhanced_grpo_no_unsloth"  # checkpoints, final model and summary go here
LEARNING_RATE = 3e-7
EPOCHS = 1
BETA = 0.1  # handed to the trainer as `beta`
PER_DEVICE_TRAIN_BATCH_SIZE = 1
GRAD_ACCUM_STEPS = 8

DATASET_NAME = "tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news"
# Loaded with AutoModelForCausalLM and used for the optional "AI reward" term.
# NOTE(review): DialoGPT is a dialogue language model, not a trained reward model —
# confirm this is the scorer that is actually intended.
REWARD_MODEL_NAME = "microsoft/DialoGPT-medium"

In [None]:
# Load base model in 4bit
# Resolve the compute dtype once so the quantization config and the model agree.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

quant_config = BitsAndBytesConfig(
    load_in_4bit=LOAD_IN_4BIT,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config if LOAD_IN_4BIT else None,
    torch_dtype=compute_dtype,
    device_map="auto",
)

# Attach (and continue training) LoRA
model = prepare_model_for_kbit_training(model)

# Only used when the pre-trained adapter cannot be loaded (see below).
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Load pre-trained adapter
# The adapter is loaded straight onto the base model. Wrapping with get_peft_model
# first and then calling PeftModel.from_pretrained would stack two PEFT wrappers.
# A local "<ADAPTER_PATH>/<CHECKPOINT>" directory is used as-is; otherwise
# ADAPTER_PATH is treated as a Hub repo id and the checkpoint is addressed through
# `subfolder`, because "<repo>/<checkpoint>" is not a valid repo id.
if os.path.isdir(os.path.join(ADAPTER_PATH, CHECKPOINT)):
    adapter_path = os.path.join(ADAPTER_PATH, CHECKPOINT)
    adapter_kwargs = {}
else:
    adapter_path = ADAPTER_PATH
    adapter_kwargs = {"subfolder": CHECKPOINT}

try:
    # is_trainable=True keeps the LoRA weights unfrozen so GRPO can update them.
    model = PeftModel.from_pretrained(model, adapter_path, is_trainable=True, **adapter_kwargs)
    print(f"Loaded adapter from {ADAPTER_PATH}/{CHECKPOINT}")
except Exception as e:
    # Best effort: fall back to a freshly initialised adapter so training can proceed.
    print(f"Warning: couldn't load adapter: {e}")
    model = get_peft_model(model, lora_cfg)
    print("Initialized a fresh LoRA adapter instead")

# Load reward model (optional)
try:
    reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_NAME)
    # The reward function tokenizes with padding=True, which raises when the
    # tokenizer has no pad token; reuse EOS in that case.
    if reward_tokenizer.pad_token is None:
        reward_tokenizer.pad_token = reward_tokenizer.eos_token
    reward_model = AutoModelForCausalLM.from_pretrained(REWARD_MODEL_NAME, device_map="auto")
    reward_model.eval()
    print("Reward model loaded")
except Exception as e:
    print(f"Reward model unavailable, fallback to rule-based only: {e}")
    reward_model, reward_tokenizer = None, None

In [None]:
# Load dataset
# Loads the "train" split named by DATASET_NAME and prints how many samples it has.
raw_ds = load_dataset(DATASET_NAME, split="train")
print(f"Loaded dataset {DATASET_NAME} with {len(raw_ds):,} samples")

# Convert to TRL chat format: list of dicts with 'conversations'
def to_conversations(examples):
    """Turn a batch of instruction/input/output rows into chat-formatted columns.

    Args:
        examples: Mapping of column name to a list of values (a batched
            ``datasets.map`` input). The columns "instruction", "input" and
            "output" are read; a missing column is treated as all-empty.

    Returns:
        dict with three equally long lists:
          * "conversations": system + user + assistant messages (same as before).
          * "prompt": system + user messages only, i.e. the part the policy is
            conditioned on. TRL's GRPOTrainer reads the prompt from this column.
          * "ground_truth": the reference assistant text, kept as its own column
            so a reward function can compare completions against it.
    """
    columns = {name: examples.get(name) for name in ("instruction", "input", "output")}
    # Size the batch from whichever column is present so one missing column does
    # not silently shrink the output to zero rows.
    batch_size = max((len(col) for col in columns.values() if col is not None), default=0)

    def _column(name):
        """Return the named column as a list padded with "" up to the batch size."""
        values = list(columns[name]) if columns[name] is not None else []
        return values + [""] * (batch_size - len(values))

    chats, prompts, ground_truths = [], [], []
    for inst, inp, out in zip(_column("instruction"), _column("input"), _column("output")):
        system_text = inst or "You are a helpful Bitcoin market analyst."
        user_text = inp or ""
        answer_text = out or ""
        prompts.append([
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ])
        chats.append([
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": answer_text},
        ])
        ground_truths.append(answer_text)
    return {"conversations": chats, "prompt": prompts, "ground_truth": ground_truths}

# Swap the raw columns for the chat-formatted ones, then preview the first row.
train_ds = raw_ds.map(
    to_conversations,
    batched=True,
    remove_columns=raw_ds.column_names,
)
print("Example conv:")
first_conversation = train_ds[0]["conversations"]
for message in first_conversation:
    preview = message["content"][:120]
    print(message["role"], preview)

In [None]:
# Structured-output helpers

def parse_trading_output(text):
    """Extract a trading-decision dict from model text.

    Three strategies are tried in order:
      1. the whole text as a JSON object;
      2. the first brace-delimited fragment (without nested braces) that
         mentions "action" and decodes as a JSON object;
      3. field-by-field regex extraction of "action", "confidence",
         "stop_loss", "take_profit" and "forecast_10d".

    Args:
        text: Raw model output.

    Returns:
        A dict, or None when `text` is empty, is not a string, or nothing could
        be extracted. JSON that decodes to a non-object (e.g. ``42`` or
        ``[1, 2]``) is never returned, so callers can rely on ``.get``.
    """
    if not text or not isinstance(text, str):
        return None

    def _load_object(candidate):
        """Return `candidate` decoded as a JSON object, or None if it is not one."""
        try:
            decoded = json.loads(candidate.strip())
        except (ValueError, TypeError, RecursionError):
            return None
        return decoded if isinstance(decoded, dict) else None

    whole = _load_object(text)
    if whole is not None:
        return whole

    for fragment in re.findall(r'\{[^{}]*"action"[^{}]*\}', text, re.IGNORECASE | re.DOTALL):
        parsed = _load_object(fragment)
        if parsed is not None:
            return parsed

    # Last resort: pick individual fields out of loosely formatted text.
    result = {}
    m = re.search(r'"action"\s*:\s*"([^"]+)"', text, re.IGNORECASE)
    if m:
        result['action'] = m.group(1).upper()
    for field in ('confidence', 'stop_loss', 'take_profit'):
        m = re.search(r'"' + field + r'"\s*:\s*(\d+(?:\.\d+)?)', text, re.IGNORECASE)
        if m:
            result[field] = float(m.group(1))
    m = re.search(r'"forecast_10d"\s*:\s*\[([^\]]+)\]', text, re.IGNORECASE)
    if m:
        try:
            result['forecast_10d'] = [float(x.strip()) for x in m.group(1).split(',')]
        except ValueError:
            # A malformed forecast list is dropped; the other fields are still returned.
            pass
    return result if result else None


def calculate_forecast_similarity(resp_forecast, gt_forecast):
    """Score how closely a predicted price path follows a reference path.

    Both sequences are truncated to their common length and compared on three
    components:
      * correlation (40%): Pearson correlation, counted only when positive, so
        an inverted forecast earns nothing here;
      * direction (30%): share of steps whose up/down move matches;
      * magnitude (30%): ``1 - mse/4`` of the z-scored series, floored at 0.

    Args:
        resp_forecast: Predicted values (numbers or numeric strings).
        gt_forecast: Reference values (numbers or numeric strings).

    Returns:
        float in [0, 1]; 0.0 when either input is empty, fewer than two
        comparable points remain, or a value cannot be converted to float.
    """
    if not resp_forecast or not gt_forecast:
        return 0.0
    try:
        r = np.array([float(x) for x in resp_forecast if isinstance(x, (int, float, str))])
        g = np.array([float(x) for x in gt_forecast if isinstance(x, (int, float, str))])
        n = min(len(r), len(g))
        if n < 2:
            return 0.0
        r, g = r[:n], g[:n]

        score = 0.0
        # corr (40%) — a constant series gives NaN, which simply earns no credit.
        with np.errstate(divide="ignore", invalid="ignore"):
            corr = np.corrcoef(r, g)[0, 1]
        if np.isfinite(corr):
            score += max(0.0, float(corr)) * 0.4
        # directional (30%)
        rd, gd = np.diff(r) > 0, np.diff(g) > 0
        score += float(np.mean(rd == gd)) * 0.3
        # magnitude (30%) — compare shapes after z-scoring; epsilon avoids division by zero.
        rn = (r - r.mean()) / (r.std() + 1e-8)
        gn = (g - g.mean()) / (g.std() + 1e-8)
        mse = ((rn - gn) ** 2).mean()
        score += max(0, 1 - (mse / 4)) * 0.3
        return float(min(1.0, max(0.0, score)))
    except (TypeError, ValueError, FloatingPointError):
        return 0.0


def calculate_comprehensive_prediction_reward(response, ground_truth, reward_model=None, reward_tokenizer=None):
    """Score a generated Bitcoin analysis with heuristic and structured criteria.

    Weighted components:
      1. prediction quality (20%): response length band plus domain keywords;
      2. technical analysis (15%): indicator and chart-pattern terms;
      3. news integration (15%): news/sentiment terms and multi-factor phrasing;
      4. specificity (10%): price/percentage targets and time frames;
      5. structured output (25%): parsed JSON fields and their closeness to the
         ground truth, or word-set overlap when no structure is parsed;
      6. AI reward (10%, optional): sigmoid of the mean logits `reward_model`
         produces for the first 400 characters of the response;
      7. flat bonuses for list/section markers and risk disclaimers;
      8. flat penalties for over-certain wording and heavy hedging.

    Args:
        response: Generated text; None is treated as an empty string.
        ground_truth: Reference text, or None/"" when unavailable.
        reward_model: Optional model used for component 6.
        reward_tokenizer: Tokenizer matching `reward_model`.

    Returns:
        float clamped to [0, 1].
    """
    import warnings  # local import: only needed to report a failing AI-reward pass

    # Normalise inputs so a None completion or non-string reference cannot crash scoring.
    response = "" if response is None else str(response)
    if ground_truth is not None and not isinstance(ground_truth, str):
        ground_truth = str(ground_truth)

    total_reward = 0.0
    rl = response.lower()

    # 1) Prediction quality (20%)
    pred = 0.0
    L = len(response)
    if 150 <= L <= 1000: pred += 0.15
    elif 100 <= L <= 1200: pred += 0.10
    elif L < 100: pred -= 0.05
    kw = {
        'prediction': 0.08,'forecast':0.08,'analysis':0.06,'market':0.06,
        'bitcoin':0.05,'btc':0.04,'price':0.07,'trend':0.06,
        'technical':0.05,'fundamental':0.05,'news':0.05,'data':0.04,
        'indicators':0.05,'volume':0.04,'resistance':0.04,'support':0.04,
        'momentum':0.04,'correlation':0.03,'pattern':0.04,'signal':0.04
    }
    for k, w in kw.items():
        if k in rl: pred += w
    total_reward += pred * 0.20

    # 2) Technical analysis (15%)
    tech = 0.0
    terms = ['rsi','macd','moving average','bollinger bands','fibonacci','support level','resistance level','breakout','consolidation','overbought','oversold','bullish divergence','bearish divergence']
    tech += min(0.15, sum(0.03 for t in terms if t in rl))
    patt = ['head and shoulders','double top','double bottom','triangle','flag','pennant','cup and handle','ascending','descending']
    tech += min(0.08, sum(0.02 for t in patt if t in rl))
    total_reward += tech * 0.15

    # 3) News integration (15%)
    news = 0.0
    nts = ['news impact','market sentiment','adoption news','regulatory','institutional','whale activity','exchange','etf','mainstream','correlation with','influenced by','driven by news']
    news += min(0.12, sum(0.03 for t in nts if t in rl))
    if any(t in rl for t in ['multiple factors','combined with','along with','considering','taking into account','综合考虑','holistic','comprehensive']):
        news += 0.08
    total_reward += news * 0.15

    # 4) Specificity (10%)
    spec = 0.0
    price_targets = re.findall(r'\$\d+[,\d]*', response)
    pct_targets = re.findall(r'\d+\.?\d*%', response)
    if price_targets or pct_targets: spec += 0.10
    tfs = ['within.*day','next.*week','coming.*month','short.*term','long.*term','by.*end','Q[1-4]','next.*year']
    # Case-insensitive search: the text is lowercased, so 'Q[1-4]' could never match otherwise.
    if any(re.search(tf, rl, re.IGNORECASE) for tf in tfs): spec += 0.05
    total_reward += spec * 0.10

    # 5) Structured output (25%)
    structured = 0.0
    rj = parse_trading_output(response)
    gj = parse_trading_output(ground_truth) if ground_truth else None
    # Only dicts are usable below; anything else is treated as "no structure".
    if not isinstance(rj, dict): rj = None
    if not isinstance(gj, dict): gj = None
    if rj:
        structured += 0.05
        req = ['action','confidence','stop_loss','take_profit','forecast_10d']
        present = sum(1 for f in req if f in rj)
        structured += (present/len(req)) * 0.10
        if gj:
            # str() guards against a non-string action value from parsed JSON.
            if str(rj.get('action','')).strip().upper() == str(gj.get('action','')).strip().upper():
                structured += 0.08
            rc, gc = rj.get('confidence',0), gj.get('confidence',0)
            if isinstance(rc,(int,float)) and isinstance(gc,(int,float)):
                structured += max(0, 1-abs(rc-gc)/100) * 0.06
            for fld in ['stop_loss','take_profit']:
                rp, gp = rj.get(fld,0), gj.get(fld,0)
                if isinstance(rp,(int,float)) and isinstance(gp,(int,float)) and gp>0:
                    structured += max(0, 1-abs((rp-gp)/gp)) * 0.04
            rf, gf = rj.get('forecast_10d',[]), gj.get('forecast_10d',[])
            if isinstance(rf,list) and isinstance(gf,list) and len(gf)>0:
                structured += calculate_forecast_similarity(rf, gf) * 0.08
        if 'confidence' in rj:
            c = rj.get('confidence',0)
            if isinstance(c,(int,float)) and 0 <= c <= 100: structured += 0.02
        for fld in ['stop_loss','take_profit']:
            p = rj.get(fld,0)
            if isinstance(p,(int,float)) and p>0: structured += 0.01
    else:
        if ground_truth:
            # No parseable structure: fall back to word-set overlap with the reference.
            rt = set(rl.split())
            gt = set(ground_truth.lower().split())
            if len(gt)>0:
                structured += (len(rt & gt)/len(rt | gt)) * 0.15
    total_reward += structured * 0.25

    # 6) AI reward (10%)
    if reward_model is not None and reward_tokenizer is not None:
        try:
            text = f"Bitcoin Prediction Analysis: {response[:400]}"
            inputs = reward_tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
            device = next(reward_model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                out = reward_model(**inputs)
            logits = getattr(out, 'logits', None)
            ai_r = torch.sigmoid(logits.mean()).item() if logits is not None else 0.5
            total_reward += ai_r * 0.10
        except Exception as exc:
            # Best effort: keep the rule-based score but report why the model term was skipped.
            warnings.warn(f"AI reward skipped: {exc}", RuntimeWarning)

    # 7) Bonuses
    bonus = 0.0
    if any(m in response for m in ['1.','2.','3.','•','-','Analysis:','Prediction:','Factors:']):
        bonus += 0.06
    if any(t in rl for t in ['not financial advice','high risk','volatile','dyor','past performance']):
        bonus += 0.04
    total_reward += bonus

    # 8) Penalties
    pen = 0.0
    pen -= min(0.15, sum(0.03 for t in ['guaranteed','definitely will','certain','100%','no doubt'] if t in rl))
    hedges = sum(1 for t in ['maybe','might','could be','possibly','perhaps'] if t in rl)
    if hedges > 2: pen -= 0.05
    total_reward += pen

    return float(max(0.0, min(1.0, total_reward)))

print("Reward function ready")

In [None]:
# Custom Trainer integrating the reward
class CustomGRPOTrainer(GRPOTrainer):
    """GRPOTrainer that scores completions with the notebook's rule-based reward.

    GRPOTrainer obtains rewards from the callables passed as `reward_funcs`, so
    unless the caller supplies that argument this class registers its own scorer,
    which routes every completion through `compute_reward`.

    The constructor also accepts the keyword names used elsewhere in this
    notebook and maps them onto what GRPOTrainer/GRPOConfig expect:
      * `tokenizer`          -> `processing_class`
      * `max_prompt_length`  -> copied onto `args` when it has that field
      * `beta`               -> copied onto `args` when it has that field
      * `max_length`         -> `args.max_completion_length` is set to
                                `max_length - max_prompt_length` when positive
      * `formatting_func`    -> dropped (GRPOTrainer has no such argument)
    """

    def __init__(self, reward_model=None, reward_tokenizer=None, **kwargs):
        """Build the trainer.

        Args:
            reward_model: Optional model forwarded to the reward function.
            reward_tokenizer: Tokenizer matching `reward_model`.
            **kwargs: Forwarded to GRPOTrainer after the translation described
                in the class docstring.
        """
        # Assigned first because the default reward callable reads them.
        self.reward_model = reward_model
        self.reward_tokenizer = reward_tokenizer

        legacy_tokenizer = kwargs.pop("tokenizer", None)
        if legacy_tokenizer is not None:
            kwargs.setdefault("processing_class", legacy_tokenizer)
        kwargs.pop("formatting_func", None)

        config = kwargs.get("args")
        max_length = kwargs.pop("max_length", None)
        for option in ("max_prompt_length", "beta"):
            if option in kwargs:
                value = kwargs.pop(option)
                if value is not None and hasattr(config, option):
                    setattr(config, option, value)
        if max_length is not None and hasattr(config, "max_completion_length"):
            prompt_budget = getattr(config, "max_prompt_length", None) or 0
            if max_length > prompt_budget:
                config.max_completion_length = max_length - prompt_budget

        kwargs.setdefault("reward_funcs", [self._score_completions])
        super().__init__(**kwargs)

        # Re-assert in case the parent constructor touched these names.
        self.reward_model = reward_model
        self.reward_tokenizer = reward_tokenizer

    @staticmethod
    def _completion_text(completion):
        """Flatten a completion (plain string or list of chat messages) to text."""
        if isinstance(completion, str):
            return completion
        if isinstance(completion, dict):
            return str(completion.get("content", ""))
        if isinstance(completion, (list, tuple)):
            return "".join(str(msg.get("content", "")) for msg in completion if isinstance(msg, dict))
        return str(completion)

    @staticmethod
    def _reference_from_conversation(conversation):
        """Return the last assistant message of a conversation, or None."""
        if isinstance(conversation, (list, tuple)):
            for message in reversed(conversation):
                if isinstance(message, dict) and message.get("role") == "assistant":
                    return message.get("content") or None
        return None

    def _score_completions(self, prompts=None, completions=None, **columns):
        """Reward callable registered with GRPOTrainer: one float per completion.

        The reference answer is taken from a "ground_truth" column when the
        dataset provides one, otherwise from the last assistant message of a
        "conversations" column; with neither, scoring runs without a reference.
        """
        references = columns.get("ground_truth")
        conversations = columns.get("conversations")
        rewards = []
        for idx, completion in enumerate(completions or []):
            reference = None
            if references is not None and idx < len(references):
                reference = references[idx]
            elif conversations is not None and idx < len(conversations):
                reference = self._reference_from_conversation(conversations[idx])
            reward = self.compute_reward(self._completion_text(completion), reference)
            rewards.append(float(reward[0]))
        return rewards

    def compute_reward(self, model_output, reference_output=None):
        """Score one model output against an optional reference.

        Args:
            model_output: Text, or a generation result exposing `.sequences`
                (its first sequence is decoded); anything else is passed
                through `str()`.
            reference_output: Reference text or None.

        Returns:
            A float32 tensor of shape (1,); 0.5 if scoring raised.
        """
        try:
            # model_output may be text already depending on TRL internals; be robust
            if isinstance(model_output, str):
                resp_text = model_output
            elif hasattr(model_output, 'sequences'):
                # Newer Trainer versions expose the tokenizer as `processing_class`.
                decoder = getattr(self, "processing_class", None) or getattr(self, "tokenizer", None)
                resp_text = decoder.decode(model_output.sequences[0], skip_special_tokens=True)
            else:
                resp_text = str(model_output)
            ref_text = reference_output if isinstance(reference_output, str) else (str(reference_output) if reference_output is not None else None)
            r = calculate_comprehensive_prediction_reward(resp_text, ref_text, self.reward_model, self.reward_tokenizer)
            return torch.tensor([r], dtype=torch.float32)
        except Exception as e:
            print("Reward error:", e)
            return torch.tensor([0.5], dtype=torch.float32)

print("CustomGRPOTrainer ready")

In [None]:
# Training setup
# GRPOConfig (imported from trl at the top of the notebook) replaces
# TrainingArguments, which was never imported and raised NameError here.
# `evaluation_strategy` is omitted: newer transformers releases renamed it to
# `eval_strategy`, and "no" is the default either way.
# NOTE(review): depending on the TRL version, the effective batch size must be
# divisible by GRPOConfig.num_generations — confirm against the installed release.
args = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=100,
    save_strategy="steps",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=(torch.cuda.is_available() and not torch.cuda.is_bf16_supported()),
    bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported()),
    remove_unused_columns=False,
    dataloader_num_workers=2,
    seed=SEED,
    report_to="none",
)

# Formatting function for TRL GRPO

def formatting_func(example):
    """Return the chat messages stored under the "conversations" key of `example`."""
    return example["conversations"]

trainer = CustomGRPOTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    max_prompt_length=MAX_PROMPT_LENGTH,
    beta=BETA,
    formatting_func=formatting_func,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
)

print("Trainer initialized")

In [None]:
# Marker cell: announces that TRL's GRPOConfig is the intended training configuration.
# NOTE(review): a GRPOConfig(...) cell sits near the end of this notebook, after the
# training cell — confirm it is run before training when that variant is wanted.
print("Using TRL GRPOConfig for GRPO training.")

In [None]:
# Train
# Time the run with wall-clock timestamps taken around trainer.train().
print("🚀 Starting GRPO training (no Unsloth)...")
start_time = datetime.now()
train_stats = trainer.train()
end_time = datetime.now()
duration = end_time - start_time
print("🎉 Training complete.")
# Both statistics are looked up defensively; a missing attribute prints None.
for _label, _attr in (("Final loss:", 'training_loss'), ("Steps:", 'global_step')):
    print(_label, getattr(train_stats, _attr, None))
print("Duration:", duration)

In [None]:
# Save model and summary
# The model and tokenizer are written to the same directory, model first.
save_dir = f"{OUTPUT_DIR}/final_model"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(save_dir)
print("Saved model to", save_dir)

# Run metadata, written next to the checkpoints as JSON.
timing = {
    "start": start_time.isoformat(),
    "end": end_time.isoformat(),
    "duration": str(duration),
}
summary = dict(
    model=BASE_MODEL,
    adapter=f"{ADAPTER_PATH}/{CHECKPOINT}",
    dataset=DATASET_NAME,
    samples=len(train_ds),
    epochs=EPOCHS,
    lr=LEARNING_RATE,
    beta=BETA,
    max_length=MAX_LENGTH,
    max_prompt_length=MAX_PROMPT_LENGTH,
    time=timing,
)
os.makedirs(OUTPUT_DIR, exist_ok=True)
summary_path = os.path.join(OUTPUT_DIR, "training_summary.json")
with open(summary_path, "w") as summary_file:
    json.dump(summary, summary_file, indent=2)
print("Summary saved")

In [None]:
# Quick inference test (structured output)
model.eval()

def chat(messages):
    """Generate one sampled reply for a list of chat messages.

    The prompt is rendered with the tokenizer's chat template when one is
    defined, so the model sees the message format it was instruction-tuned on.
    Tokenizers without a template fall back to a plain "ROLE: content" transcript.

    Args:
        messages: List of {"role": ..., "content": ...} dicts.

    Returns:
        The decoded continuation only (prompt tokens are stripped).
    """
    if getattr(tokenizer, "chat_template", None):
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # The rendered template already carries its special tokens.
        inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
    else:
        prompt = "\n".join([f"{m['role'].upper()}: {m['content']}" for m in messages]) + "\nASSISTANT:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=256, temperature=0.7, do_sample=True, pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return text

messages = [
    {"role":"system","content":"You are an expert Bitcoin market analyst. Output ONLY valid JSON with keys action, confidence, stop_loss, take_profit, forecast_10d (10 numbers)."},
    {"role":"user","content":"Give your Bitcoin trading decision for the next 10 days with stop loss, take profit and a 10-day price forecast."}
]
resp = chat(messages)
print("Model raw response:\n", resp)
print("Parsed:", parse_trading_output(resp))

In [None]:
# GRPO reward function wrapper
# Tries to handle different shapes passed by TRL

def grpo_reward_func(texts=None, prompts=None, samples=None, completions=None, **kwargs):
    """Return one reward per generated response, whatever shape the caller uses.

    Accepted call shapes, checked in this order:
      * `completions`: list of strings or of chat-message lists (the keyword
        TRL's GRPOTrainer uses). References come from a `ground_truth` keyword
        column, else from the last assistant message of a `conversations`
        keyword column, else "".
      * `texts`: list of response strings, scored the same way.
      * `samples`: list of dicts carrying the response under 'response',
        'text' or 'output' and the reference under 'ground_truth' or as the
        last assistant message of 'conversations'.

    Returns:
        list[float]. When no shape is recognised, a neutral 0.5 is returned for
        each item of the first list-like argument (or a single 0.5).
    """
    def _as_text(item):
        """Flatten a string / message dict / message list into plain text."""
        if isinstance(item, str):
            return item
        if isinstance(item, dict):
            return str(item.get('content', '') or '')
        if isinstance(item, (list, tuple)):
            return "".join(str(m.get('content', '') or '') for m in item if isinstance(m, dict))
        return "" if item is None else str(item)

    def _last_assistant(conv):
        """Return the trailing assistant message of a conversation, or ""."""
        if isinstance(conv, (list, tuple)) and len(conv) > 0 and isinstance(conv[-1], dict) and conv[-1].get('role') == 'assistant':
            return conv[-1].get('content', '') or ""
        return ""

    def _reference_at(idx):
        """Look up the reference for row `idx` from the keyword columns."""
        refs = kwargs.get('ground_truth')
        if isinstance(refs, (list, tuple)) and idx < len(refs) and refs[idx]:
            return _as_text(refs[idx])
        convs = kwargs.get('conversations')
        if isinstance(convs, (list, tuple)) and idx < len(convs):
            return _last_assistant(convs[idx])
        return ""

    responses = None
    if completions is not None:
        responses = list(completions)
    elif texts is not None and isinstance(texts, list) and (len(texts) == 0 or isinstance(texts[0], str)):
        # Only model responses provided
        responses = texts
    if responses is not None:
        return [
            calculate_comprehensive_prediction_reward(_as_text(resp), ground_truth=_reference_at(idx), reward_model=reward_model, reward_tokenizer=reward_tokenizer)
            for idx, resp in enumerate(responses)
        ]

    if samples is not None and isinstance(samples, list):
        rewards = []
        for s in samples:
            if not isinstance(s, dict):
                # Not a record: treat the item itself as the response text.
                rewards.append(calculate_comprehensive_prediction_reward(_as_text(s), "", reward_model=reward_model, reward_tokenizer=reward_tokenizer))
                continue
            # Try to extract response and ground truth
            resp = s.get('response') or s.get('text') or s.get('output') or ""
            # If conversations exist, last assistant is GT
            gt = s.get('ground_truth') or _last_assistant(s.get('conversations'))
            rewards.append(calculate_comprehensive_prediction_reward(resp, gt, reward_model=reward_model, reward_tokenizer=reward_tokenizer))
        return rewards

    # Fallback: keep the output aligned with the batch so the caller gets one value per row.
    for candidate in (texts, prompts, samples):
        if isinstance(candidate, (list, tuple)):
            return [0.5] * len(candidate)
    return [0.5]

In [None]:
# Attach reward function if trainer supports it
# Stores grpo_reward_func on the trainer as a plain attribute named `reward_func`.
# NOTE(review): no visible cell reads `trainer.reward_func`, and a plain attribute
# assignment normally succeeds regardless, so the success message below does not
# prove the reward is used. TRL's GRPOTrainer presumably takes its rewards through
# the `reward_funcs` constructor argument — confirm.
try:
    trainer.reward_func = grpo_reward_func
    print("Custom reward function attached to trainer.")
except Exception as e:
    print("Could not attach reward function explicitly:", e)

## How to use

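1) Optionally install dependencies in Cell 2.
2) Run the imports, config, and model-loading cells (Cells 3–5).
3) Run dataset load (Cell 6).
4) Run the reward function and custom trainer cells (Cells 7–8).
5) Define the GRPO reward wrapper (Cell 14), initialize the trainer (Cell 9, or Cell 17 for the GRPOConfig variant), then attach the reward (Cell 15).
6) Train (Cell 11), save (Cell 12), and test inference (Cell 13).

Note: Cells 14, 15, and 17 appear after the training cell, so a plain top-to-bottom "Run All" reaches them too late; run the cells in the order listed above.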

Note: This notebook does not use Unsloth anywhere.

In [None]:
# Use TRL GRPOConfig instead of HF TrainingArguments
# `evaluation_strategy` is omitted: newer transformers releases renamed it to
# `eval_strategy`, and "no" is the default either way.
train_config = GRPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=100,
    save_strategy="steps",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=(torch.cuda.is_available() and not torch.cuda.is_bf16_supported()),
    bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported()),
    dataloader_num_workers=2,
    seed=SEED,
    report_to=[]
)

# Recreate trainer with GRPOConfig
# This rebinds `trainer`, so anything set on the previous trainer object
# (for example an attribute attached in an earlier cell) is not carried over.
trainer = CustomGRPOTrainer(
    model=model,
    args=train_config,
    train_dataset=train_ds,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    max_prompt_length=MAX_PROMPT_LENGTH,
    beta=BETA,
    formatting_func=formatting_func,
    reward_model=reward_model,
    reward_tokenizer=reward_tokenizer,
)
print("Trainer re-initialized with TRL GRPOConfig")