In [9]:
!pip install datasets transformers peft trl accelerate torch scikit-learn


Collecting trl
  Using cached trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_

In [3]:
%%bash
# src/ folder for scripts + processed data + plots
mkdir -p data outputs plots src

In [4]:
%%bash
cat > src/data.py <<'EOF'
# src/data.py
import argparse
from datasets import load_dataset

SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. "
    "The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. "
    "The reasoning process and answer are enclosed within <think></think> and <answer></answer> tags."
)

def make_conversation(example):
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": example["problem"]},
        ],
        "solution": example["solution"],
    }

def main():
    p = argparse.ArgumentParser()
    p.add_argument("--dataset",     type=str, required=True)
    p.add_argument("--split-train", type=str, default="train[:5%]")
    p.add_argument("--split-test",  type=str, default="test[:5%]")
    p.add_argument("--output-dir",  type=str, required=True)
    args = p.parse_args()

    train_ds, test_ds = load_dataset(
        args.dataset,
        split=[ args.split_train, args.split_test ]
    )
    train_ds = train_ds.map(make_conversation, remove_columns=train_ds.column_names)
    test_ds  = test_ds.map(make_conversation,  remove_columns=test_ds.column_names)

    train_ds.save_to_disk(f"{args.output_dir}/train")
    test_ds.save_to_disk( f"{args.output_dir}/test")

if __name__ == "__main__":
    main()
EOF

In [10]:
!python src/data.py \
  --dataset AI-MO/NuminaMath-TIR \
  --split-train train[:5%] \
  --split-test  test[:5%] \
  --output-dir data

README.md: 100% 2.43k/2.43k [00:00<00:00, 13.0MB/s]
train-00000-of-00001.parquet: 100% 147M/147M [00:00<00:00, 205MB/s]
test-00000-of-00001.parquet: 100% 215k/215k [00:00<00:00, 289MB/s]
Generating train split: 100% 72441/72441 [00:02<00:00, 32642.85 examples/s]
Generating test split: 100% 99/99 [00:00<00:00, 17641.84 examples/s]
Map: 100% 3622/3622 [00:00<00:00, 6922.02 examples/s]
Map: 100% 5/5 [00:00<00:00, 865.27 examples/s]
Saving the dataset (1/1 shards): 100% 3622/3622 [00:00<00:00, 225280.18 examples/s]
Saving the dataset (1/1 shards): 100% 5/5 [00:00<00:00, 1168.92 examples/s]


In [11]:
%%bash
cat > src/verify_data.py <<'EOF'
# src/verify_data.py

from datasets import load_from_disk

def sanity_check_prompts(ds, split_name):
    bad = []
    for i, ex in enumerate(ds):
        # 1) prompt must be a list of two messages
        if not isinstance(ex["prompt"], list) or len(ex["prompt"]) != 2:
            bad.append((i, "wrong prompt structure"))
            continue

        sys_msg, user_msg = ex["prompt"]

        # 2) system message should contain <think>… and mention <answer> tags in guidance
        if "<think>" not in sys_msg["content"] or "<answer>" not in sys_msg["content"]:
            bad.append((i, "system prompt missing <think> or <answer> guidance"))

        # 3) user message should be clean (no tags yet)
        if "<think>" in user_msg["content"] or "<answer>" in user_msg["content"]:
            bad.append((i, "user prompt already has tags"))

        # 4) solution field must exist and be non-empty
        if "solution" not in ex or not ex["solution"].strip():
            bad.append((i, "missing or empty solution"))

    if bad:
        print(f"\n⚠️  Found {len(bad)} issues in {split_name} split:")
        for idx, msg in bad[:10]:
            print(f"  • #{idx}: {msg}")
        raise RuntimeError(f"{split_name} sanity check failed")
    print(f"✅ All {len(ds)} examples in {split_name} look good.")


def verify_counts(ds, expected_count, split_name):
    actual = len(ds)
    print(f"{split_name:>5}: {actual} examples (expected {expected_count})")
    if expected_count is not None and actual != expected_count:
        raise RuntimeError(f"{split_name} has {actual} examples but expected {expected_count}")


def main():
    # adjust these if we ever need change our splits
    expected = {"train": 3622, "test": 5}

    for split in ["train", "test"]:
        path = f"data/{split}"
        ds = load_from_disk(path)
        verify_counts(ds, expected[split], split)
        sanity_check_prompts(ds, split)

    print("\n🎉 All data splits passed verification!\n")

    # Show the first 5 test examples
    print("Here are your 5 test examples:\n")
    test_ds = load_from_disk("data/test")
    for i, ex in enumerate(test_ds):
        sys_msg, user_msg = ex["prompt"]
        print(f"--- example {i} ---")
        print("system:", sys_msg["content"])
        print("user:  ", user_msg["content"])
        print()
        if i >= 4:
            break


if __name__ == "__main__":
    main()

EOF

In [12]:
!python src/verify_data.py

train: 3622 examples (expected 3622)
✅ All 3622 examples in train look good.
 test: 5 examples (expected 5)
✅ All 5 examples in test look good.

🎉 All data splits passed verification!

Here are your 5 test examples:

--- example 0 ---
system: A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think></think> and <answer></answer> tags.
user:   In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?

--- example 1 ---
system: A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think></think> and <answer></answer> tags.
user:

In [45]:
%%bash
cat > src/train.py <<'EOF'
#!/usr/bin/env python3
# src/train.py

import argparse
import logging
import os
import sys
import time
from typing import List, Tuple, Union

import torch
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# ─── Hyperparameters & Defaults ─────────────────────────────────────────────
DEFAULT_LR             = 1e-5
DEFAULT_LORA_R         = 8
DEFAULT_LORA_ALPHA     = 32
DEFAULT_LORA_DROPOUT   = 0.1
DEFAULT_EPOCHS         = 1
DEFAULT_BATCH_SIZE     = 4
DEFAULT_GRAD_ACCUM     = 1
DEFAULT_MAX_PROMPT     = 128
DEFAULT_MAX_COMPLETION = 64
DEFAULT_NUM_GEN        = 4

# ─── Logging Setup ───────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# ─── Utility: Sequence Statistics ─────────────────────────────────────────────
def compute_seq_stats(
    full_text: str,
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
) -> Tuple[float, float]:
    """
    Given prompt+generation, regenerate with output_scores=True,
    then compute:
      - average token log-prob
      - maximum token entropy (the “spike”)
    """
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
    seq_len = inputs["input_ids"].shape[1]
    max_len = max(200, seq_len + 1)  # ensure max_length > input length
    generation = model.generate(
        **inputs,
        max_length=max_len,
        return_dict_in_generate=True,
        output_scores=True,
    )
    token_ids   = generation.sequences[0][1:].tolist()
    logits_list = generation.scores

    log_probs = []
    entropies = []
    for tok_id, logits in zip(token_ids, logits_list):
        probs = torch.softmax(logits[0], dim=-1)
        logp  = torch.log(probs[tok_id] + 1e-20).item()
        ent   = -(probs * torch.log(probs + 1e-20)).sum().item()
        log_probs.append(logp)
        entropies.append(ent)

    avg_logp  = sum(log_probs) / len(log_probs)
    max_spike = max(entropies)
    return avg_logp, max_spike

# ─── Reward Functions (closures capturing model & tokenizer) ────────────────
def make_logprob_reward(model, tokenizer):
    def logprob_reward(
        prompts: List[List[dict]],
        completions: List[Union[str, List[str], List[dict]]],
        **_
    ):
        rewards = []
        for prompt, gen in zip(prompts, completions):
            # normalize gen into one string
            if isinstance(gen, str):
                gen_text = gen
            elif isinstance(gen, list):
                if gen and isinstance(gen[0], dict) and "content" in gen[0]:
                    gen_text = "".join(turn["content"] for turn in gen)
                else:
                    gen_text = "".join(gen)
            else:
                raise TypeError(f"Unexpected completion type: {type(gen)}")

            prompt_str = " ".join(turn["content"] for turn in prompt)
            avg_lp, _ = compute_seq_stats(prompt_str + gen_text, model, tokenizer)
            rewards.append(avg_lp)

        logger.info(f"  ▶ logprob_reward mean: {sum(rewards)/len(rewards):.3f}")
        return rewards
    return logprob_reward

def make_entropy_reward(model, tokenizer):
    def entropy_reward(
        prompts: List[List[dict]],
        completions: List[Union[str, List[str], List[dict]]],
        **_
    ):
        rewards = []
        for prompt, gen in zip(prompts, completions):
            # normalize gen into one string
            if isinstance(gen, str):
                gen_text = gen
            elif isinstance(gen, list):
                if gen and isinstance(gen[0], dict) and "content" in gen[0]:
                    gen_text = "".join(turn["content"] for turn in gen)
                else:
                    gen_text = "".join(gen)
            else:
                raise TypeError(f"Unexpected completion type: {type(gen)}")

            prompt_str = " ".join(turn["content"] for turn in prompt)
            _, max_ent = compute_seq_stats(prompt_str + gen_text, model, tokenizer)
            rewards.append(-max_ent)

        logger.info(f"  ▶ entropy_reward mean: {sum(rewards)/len(rewards):.3f}")
        return rewards
    return entropy_reward

# ─── Argument Parsing ────────────────────────────────────────────────────────
def parse_args():
    p = argparse.ArgumentParser(
        description="LoRA + GRPO training with logprob + entropy‐spike rewards"
    )
    p.add_argument("--model-id",   required=True,
                   help="Base HF model ID, e.g. Qwen/Qwen2-0.5B-Instruct")
    p.add_argument("--data-dir",   required=True,
                   help="Directory containing `train/` split from data preprocessing")
    p.add_argument("--output-dir", required=True)
    p.add_argument("--epochs",     type=int, default=DEFAULT_EPOCHS)
    p.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    p.add_argument("--lr", "--learning-rate",
                   dest="lr", type=float, default=DEFAULT_LR,
                   help="Learning rate")
    return p.parse_args()

# ─── Main ────────────────────────────────────────────────────────────────────
def main():
    args = parse_args()

    # 1) Load preprocessed train split
    train_path = os.path.join(args.data_dir, "train")
    logger.info(f"Loading dataset from {train_path} …")
    train_ds = load_from_disk(train_path)

    # 2) Load base model & apply LoRA
    logger.info(f"Loading base model ({args.model_id}) + applying LoRA…")
    base_model = AutoModelForCausalLM.from_pretrained(
        args.model_id, device_map="auto", torch_dtype="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_id, use_fast=True)
    lora_cfg  = LoraConfig(
        task_type="CAUSAL_LM",
        r=DEFAULT_LORA_R,
        lora_alpha=DEFAULT_LORA_ALPHA,
        lora_dropout=DEFAULT_LORA_DROPOUT,
        target_modules=["q_proj","v_proj"],
    )
    model = get_peft_model(base_model, lora_cfg)
    model.print_trainable_parameters()

    # 3) Make our two reward functions
    logprob_reward = make_logprob_reward(model, tokenizer)
    entropy_reward = make_entropy_reward(model, tokenizer)

    # 4) Configure GRPO
    logger.info("Configuring GRPO trainer…")
    grpo_cfg = GRPOConfig(
        output_dir=args.output_dir,
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        gradient_accumulation_steps=DEFAULT_GRAD_ACCUM,
        bf16=True,
        max_prompt_length=DEFAULT_MAX_PROMPT,
        max_completion_length=DEFAULT_MAX_COMPLETION,
        num_generations=DEFAULT_NUM_GEN,
        remove_unused_columns=False,
        report_to=["tensorboard"],
        logging_steps=1,
        save_strategy="steps",
        save_steps=100,
        push_to_hub=False,
    )
    trainer = GRPOTrainer(
        model=model,
        args=grpo_cfg,
        train_dataset=train_ds,
        reward_funcs=[logprob_reward, entropy_reward]
    )

    # 5) Train!
    logger.info("Starting training…")
    t0 = time.time()
    trainer.train()
    trainer.save_model(args.output_dir)
    logger.info(f"Done in {(time.time()-t0)/60:.1f} min")

if __name__ == "__main__":
    main()
EOF


In [46]:
import torch
print(torch.cuda.is_available())


True


In [47]:
!python src/train.py \
  --model-id Qwen/Qwen2-0.5B-Instruct \
  --data-dir data \
  --output-dir outputs \
  --epochs 1 \
  --batch-size 4 \
  --lr 1e-5

2025-05-22 23:18:34.354770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747955914.374668    6380 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747955914.380727    6380 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-22 23:18:34.401423: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
trainable 

In [4]:
%%bash
cat > src/eval.py <<'EOF'
#!/usr/bin/env python3
import argparse, time, re, os
import numpy as np
import torch
import matplotlib.pyplot as plt
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc, f1_score,
    confusion_matrix, precision_score, recall_score
)

def parse_args():
    p = argparse.ArgumentParser(description="Comprehensive evaluation of fine-tuned LLM + detectors")
    p.add_argument("--model-dir",   required=True, help="Path or HF repo ID of your fine-tuned adapter")
    p.add_argument("--base-model",  required=True, help="Base LLM repo ID (must match fine-tune)")
    p.add_argument("--data-dir",    default="data", help="Either a `test/` subfolder or dataset root")
    p.add_argument("--num-samples", type=int, default=100, help="How many examples to eval")
    p.add_argument("--plots-dir",   default="plots", help="Where to save histogram & curve PNGs")
    return p.parse_args()

def generate_and_score(prompt, model, tokenizer):
    # build input string
    text = " ".join(turn["content"] for turn in prompt)
    # tokenize
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # generation
    start = time.time()
    out = model.generate(
        **inputs,
        max_length=200,
        return_dict_in_generate=True,
        output_scores=True,
    )
    elapsed = time.time() - start
    seq = out.sequences[0]
    # decode
    generated = tokenizer.decode(seq, skip_special_tokens=True)
    gen_len   = seq.size(-1) - inputs.input_ids.size(-1)
    # per-token scores
    logps, ents = [], []
    for tid, logits in zip(seq[1:], out.scores):
        probs = torch.softmax(logits[0], dim=-1)
        logps.append(torch.log(probs[tid] + 1e-20).item())
        ents.append(-(probs * torch.log(probs + 1e-20)).sum().item())
    return generated, elapsed, gen_len, float(np.mean(logps)), float(np.mean(ents))

def find_best_threshold(scores, labels):
    prec, rec, thresh = precision_recall_curve(labels, scores)
    f1 = 2*prec*rec/(prec+rec+1e-20)
    ix = np.nanargmax(f1)
    return thresh[ix], f1[ix], prec[ix], rec[ix]

def main():
    args = parse_args()
    os.makedirs(args.plots_dir, exist_ok=True)

    # 1) load model
    base  = AutoModelForCausalLM.from_pretrained(args.base_model, device_map="auto", torch_dtype="auto")
    model = PeftModel.from_pretrained(base, args.model_dir, device_map="auto")
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(args.base_model, use_fast=True)

    # 2) load data
    test_path = os.path.join(args.data_dir, "test")
    ds = load_from_disk(test_path) if os.path.isdir(test_path) else load_from_disk(args.data_dir)
    n = min(args.num_samples, len(ds))

    # 3) run eval
    times, lengths, logps, ents = [], [], [], []
    fmt_ok = corr_ok = 0
    labels = []

    for ex in ds.select(range(n)):
        out, t, L, lp, ent = generate_and_score(ex["prompt"], model, tokenizer)
        times.append(t); lengths.append(L); logps.append(lp); ents.append(ent)
        lab = int(ex.get("is_hallucinated", 0))
        labels.append(lab)
        # format check
        if re.match(r"^<think>.*?</think>\s*<answer>.*?</answer>$", out.strip(), flags=re.DOTALL):
            fmt_ok += 1
        # exact-match
        m = re.search(r"<answer>(.*?)</answer>", out, flags=re.DOTALL)
        ans = m.group(1).strip() if m else ""
        if ans == ex["solution"].strip():
            corr_ok += 1

    labels = np.array(labels)
    times, lengths = np.array(times), np.array(lengths)
    logps, ents   = np.array(logps), np.array(ents)

    # 4) metrics
    fmt_pct  = fmt_ok/n
    corr_pct = corr_ok/n
    auroc_lp = roc_auc_score(labels, -logps) if len(np.unique(labels))>1 else np.nan
    auroc_ent= roc_auc_score(labels, ents)    if len(np.unique(labels))>1 else np.nan

    # best-threshold F1 for each
    th_lp,  f1_lp,  p_lp,  r_lp  = find_best_threshold(-logps, labels)
    th_ent, f1_ent, p_ent, r_ent = find_best_threshold(ents, labels)
    # union detector
    pred_union = ((-logps>=th_lp) | (ents>=th_ent)).astype(int)
    f1_union  = f1_score(labels, pred_union)
    prec_u, rec_u = precision_score(labels, pred_union), recall_score(labels, pred_union)

    # 5) print summary
    print("\n## Evaluation Summary\n")
    print(f"Avg latency (s):        {times.mean():.3f} ± {times.std():.3f}")
    print(f"Avg tokens:             {lengths.mean():.1f} ± {lengths.std():.1f}\n")
    print(f"Avg token log-prob:     {logps.mean():.3f} ± {logps.std():.3f}")
    print(f"Avg token entropy:      {ents.mean():.3f} ± {ents.std():.3f}\n")
    print(f"AUROC (log-prob detector) : {auroc_lp:.3f}")
    print(f"AUROC (entropy detector)  : {auroc_ent:.3f}\n")
    print(f"Best-F1 log-prob @ {th_lp:.3f}: F1={f1_lp:.3f}, prec={p_lp:.3f}, rec={r_lp:.3f}")
    print(f"Best-F1 entropy  @ {th_ent:.3f}: F1={f1_ent:.3f}, prec={p_ent:.3f}, rec={r_ent:.3f}")
    print(f"Union detector: F1={f1_union:.3f}, prec={prec_u:.3f}, rec={rec_u:.3f}\n")

    # 6) save histograms
    for arr, name, xlabel in [
        (times,   "latency",      "Latency (s)"),
        (lengths, "gen_tokens",   "Generated Tokens"),
        (logps,   "logprob",      "Avg log-prob"),
        (ents,    "entropy",      "Avg entropy"),
    ]:
        plt.figure(); plt.hist(arr, bins="auto"); plt.title(xlabel)
        plt.xlabel(xlabel); plt.ylabel("Count"); plt.tight_layout()
        out = os.path.join(args.plots_dir, f"{name}_hist.png")
        plt.savefig(out); plt.close()
        print(f"Saved histogram: {out}")

    # 7) ROC & PR curves
    # ROC
    from sklearn.metrics import roc_curve
    fpr_lp, tpr_lp, _ = roc_curve(labels, -logps)
    fpr_ent, tpr_ent,_= roc_curve(labels, ents)
    plt.figure()
    plt.plot(fpr_lp, tpr_lp, label=f"logprob (AUC={auroc_lp:.2f})")
    plt.plot(fpr_ent, tpr_ent, label=f"entropy (AUC={auroc_ent:.2f})")
    plt.plot([0,1],[0,1],"--", c="gray")
    plt.xlabel("FPR"); plt.ylabel("TPR"); plt.legend(); plt.title("ROC Curve")
    plt.tight_layout()
    fn = os.path.join(args.plots_dir,"roc_curve.png"); plt.savefig(fn); plt.close()
    print(f"Saved ROC curve: {fn}")

    # PR
    plt.figure()
    prec_lp, rec_lp, _ = precision_recall_curve(labels, -logps)
    prec_ent,rec_ent,_= precision_recall_curve(labels, ents)
    auc_lp = auc(rec_lp, prec_lp)
    auc_ent= auc(rec_ent,prec_ent)
    plt.plot(rec_lp, prec_lp, label=f"logprob (AUC={auc_lp:.2f})")
    plt.plot(rec_ent,rec_ent, label=f"entropy (AUC={auc_ent:.2f})")
    plt.xlabel("Recall"); plt.ylabel("Precision"); plt.legend(); plt.title("PR Curve")
    plt.tight_layout()
    fn = os.path.join(args.plots_dir,"pr_curve.png"); plt.savefig(fn); plt.close()
    print(f"Saved PR curve: {fn}")

if __name__ == "__main__":
    main()
EOF

### Quick 5-sample sanity check

In [5]:
!python src/eval.py \
  --model-dir bkhalil1/HaluEval_vLLM \
  --base-model Qwen/Qwen2-0.5B-Instruct \
  --data-dir data \
  --num-samples 5

2025-05-23 04:00:17.379745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747972817.431453   10274 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747972817.448974   10274 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-23 04:00:17.500211: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
  _warn_prf(averag

### Mid-scale HaluEval run (50 examples)

In [1]:
!pip install --upgrade datasets fsspec
from datasets import load_dataset
import os

# 1) pull the HaluEval “qa” split
raw = load_dataset("pminervini/HaluEval", "qa")

SYSTEM = (
  "A conversation between User and Assistant. The user asks a question, "
  "and the Assistant solves it. The assistant first thinks step-by-step "
  "and then provides the answer. Enclose reasoning in <think>…</think> "
  "and the final answer in <answer>…</answer>."
)

def remap(example):
    prompt = [
        {"role":"system", "content":SYSTEM},
        {"role":"user",   "content":example["question"]},
    ]
    return {
        "prompt": prompt,
        "solution": example["right_answer"],
        "is_hallucinated": int(example["hallucinated_answer"] != example["right_answer"])
    }

# 2) apply remap and write out to data/halu_test/test
ds = raw["data"].map(remap, remove_columns=raw["data"].column_names)
os.makedirs("data/halu_test", exist_ok=True)
ds.save_to_disk("data/halu_test/test")
print(f"Wrote {len(ds)} examples to data/halu_test/test")


Collecting fsspec
  Using cached fsspec-2025.5.0-py3-none-any.whl.metadata (11 kB)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.88k [00:00<?, ?B/s]

data-00000-of-00001.parquet:   0%|          | 0.00/3.75M [00:00<?, ?B/s]

Generating data split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Wrote 10000 examples to data/halu_test/test


In [6]:
# Rough ROC/PR and averages on 50 HaluEval QA instances:
!python src/eval.py \
  --model-dir bkhalil1/HaluEval_vLLM \
  --base-model Qwen/Qwen2-0.5B-Instruct \
  --data-dir data/halu_test \
  --num-samples 50

2025-05-23 04:03:21.402161: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747973001.459497   11003 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747973001.476803   11003 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-23 04:03:21.548496: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Traceback (most re

### Full HaluEval benchmark (~10 k examples)

In [None]:
# Final, high-confidence numbers on the entire HaluEval test split:
!python src/eval.py \
  --model-dir bkhalil1/HaluEval_vLLM \
  --base-model Qwen/Qwen2-0.5B-Instruct \
  --data-dir data/halu_test \
  --num-samples 10000