In [1]:
import os
from getpass import getpass

# Never store an access token in the notebook source. Reuse a token that is
# already exported in the environment; otherwise ask for one without echoing
# it, so it does not end up in the saved notebook or its outputs.
if not os.environ.get("HF_TOKEN", "").strip():
    _hf_token = getpass("Hugging Face token (read access, leave blank to skip): ").strip()
    if _hf_token:
        os.environ["HF_TOKEN"] = _hf_token
    else:
        # Leave the variable unset rather than empty, so that a later
        # presence check cannot mistake a blank value for a real token.
        os.environ.pop("HF_TOKEN", None)
    del _hf_token

In [2]:
import os
import re
import torch
import subprocess
from tqdm import tqdm

# --- 1. Install Dependencies ---
print("Installing libraries (bitsandbytes, transformers, accelerate)...")
# Run pip through the interpreter that backs this kernel so the packages are
# installed into the environment the notebook imports from; a bare `pip`
# resolved via PATH may belong to a different Python installation.
# subprocess (rather than a shell magic) keeps the cell output clean.
import sys

subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-U",
                "transformers",
                "accelerate",
                "bitsandbytes",
                "datasets",
                "huggingface_hub"], check=True)

from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# --- 2. Authentication ---
# Test the token's value, not just the presence of the key: an empty HF_TOKEN
# (for example an unfilled placeholder) must not count as authenticated.
if os.environ.get("HF_TOKEN", "").strip():
    print("Token found in environment.")
else:
    print("Enter your Hugging Face Token (Read access):")
    login(add_to_git_credential=False)

# --- 3. Configuration ---
# Everything a reader is likely to tune lives here.

# Checkpoint to evaluate
MODEL_ID = "unsloth/meta-Llama-3.1-8B-Instruct"

# Benchmark: dataset repo, config name and split
DATASET_NAME = "openai/gsm8k"
DATASET_CONFIG = "main"
SPLIT = "test"

# Evaluation budget: number of leading examples to score (None = whole split)
# and the per-answer generation cap in new tokens
NUM_SAMPLES = 50
MAX_NEW_TOKENS = 512

# --- 4. Load Quantized Model ---
print(f"Loading {MODEL_ID} in 4-bit quantization...")

# 4-bit Config for Free Colab T4 GPU: NF4 weights with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Llama 3 needs a pad token defined if it's missing; reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# trust_remote_code is deliberately left at its default (disabled): the Llama
# architecture is implemented inside transformers, so there is no reason to
# allow Python code downloaded from the model repository to be executed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

# Create the text-generation pipeline around the already-loaded model.
# Greedy decoding (do_sample=False) is what makes the output deterministic.
# No temperature is passed: it only applies when sampling, 0.0 is not a valid
# value for it, and supplying it with do_sample=False just produces an
# "invalid generation flags" warning.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=False,      # Greedy decoding for reproducible benchmarks
    return_full_text=False  # Return only the completion, not the prompt
)

# --- 5. Load Dataset ---
print(f"Loading {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, DATASET_CONFIG, split=SPLIT)

if NUM_SAMPLES:
    # Clamp to the split size so an oversized NUM_SAMPLES evaluates the whole
    # split instead of failing on out-of-range indices in select().
    num_eval_samples = min(NUM_SAMPLES, len(dataset))
    print(f"evaluating on the first {num_eval_samples} samples only. Set NUM_SAMPLES = None for full run.")
    dataset = dataset.select(range(num_eval_samples))

# --- 6. Logic for Parsing Answers ---

def extract_number(text):
    """Extract the final numeric answer from a GSM8K solution or model response.

    The number following the first '####' delimiter is preferred (an optional
    '$' directly before the number is tolerated). If no such delimiter/number
    pair exists, the last number appearing anywhere in the text is used.

    Args:
        text: The solution or response text to scan.

    Returns:
        The answer as a float with thousands separators removed, or None when
        the text contains no number.
    """
    # A number must START with a digit. The earlier pattern `[\d,]+` also
    # matched bare punctuation commas, which either crashed float('') in the
    # '####' branch or hid the real last number in the fallback branch.
    number_pattern = r'-?\d[\d,]*(?:\.\d+)?'

    def _to_float(token):
        # Strip thousands separators; the pattern guarantees the rest parses.
        return float(token.replace(',', ''))

    # 1. Look for the explicit GSM8K delimiter '####'
    match = re.search(r'####\s*\$?\s*(' + number_pattern + r')', text)
    if match:
        return _to_float(match.group(1))

    # 2. Fallback: the very last number in the text
    #    (handles integers, decimals, and negatives)
    numbers = re.findall(number_pattern, text)
    if numbers:
        return _to_float(numbers[-1])
    return None

def create_prompt(question):
    """Render a question as a chat-formatted prompt for the model.

    A fixed system instruction asks for the final answer in the
    '#### <number>' format, and the question is supplied as the user turn.
    The two turns are passed through the module-level tokenizer's chat
    template with the generation prompt appended and tokenization disabled.

    Args:
        question: The problem statement to place in the user turn.

    Returns:
        The result of tokenizer.apply_chat_template for the two-turn chat.
    """
    system_turn = {
        "role": "system",
        "content": "You are an expert mathematician. Solve the math problem. At the very end of your response, strictly output the final answer in this format: #### <number>",
    }
    user_turn = {"role": "user", "content": question}

    rendered_prompt = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    return rendered_prompt

# --- 7. Main Evaluation Loop ---
# Scores each example one at a time: parse the reference answer, generate a
# response, parse the predicted answer, and record whether they agree.
print("\nStarting Evaluation...")
results = []
correct_count = 0

for i, example in tqdm(enumerate(dataset), total=len(dataset)):
    question, gold_answer_str = example['question'], example['answer']

    # Reference ("gold") value parsed from the dataset's answer text
    gold_number = extract_number(gold_answer_str)

    # Model response for the chat-formatted question, then its parsed value
    prompt = create_prompt(question)
    output = pipe(prompt)[0]['generated_text']
    pred_number = extract_number(output)

    # Correct only when both values parsed and they agree within 1e-4
    # (the tolerance absorbs small float precision errors)
    is_correct = (
        pred_number is not None
        and gold_number is not None
        and abs(pred_number - gold_number) < 1e-4
    )
    correct_count += int(is_correct)

    results.append(dict(
        question=question,
        gold=gold_number,
        pred=pred_number,
        correct=is_correct,
    ))

    # Only the first 3 examples are echoed, as a sanity check
    if i >= 3:
        continue
    print(f"\n--- Q{i+1} ---")
    print(f"Question: {question[:100]}...")
    print(f"Reasoning:\n{output[:200]}... [truncated]")
    print(f"Gold: {gold_number} | Pred: {pred_number} | Correct: {is_correct}")

# --- 8. Final Report ---
# Accuracy is the share of evaluated examples marked correct, as a percentage.
accuracy = (correct_count / len(dataset)) * 100

for report_line in (
    "\n" + "="*40,
    "EVALUATION COMPLETE",
    f"Model: {MODEL_ID} (4-bit)",
    f"Samples: {len(dataset)}",
    f"Accuracy: {accuracy:.2f}%",
    "="*40,
):
    print(report_line)

Installing libraries (bitsandbytes, transformers, accelerate)...
Token found in environment.
Loading unsloth/meta-Llama-3.1-8B-Instruct in 4-bit quantization...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/956 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading openai/gsm8k...


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

evaluating on the first 50 samples only. Set NUM_SAMPLES = None for full run.

Starting Evaluation...


  2%|▏         | 1/50 [00:12<10:22, 12.70s/it]


--- Q1 ---
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for ...
Reasoning:
To find out how much Janet makes at the farmers' market, we need to calculate the number of eggs she sells and multiply it by the price per egg.

Janet lays 16 eggs per day. She eats 3 eggs for breakf... [truncated]
Gold: 18.0 | Pred: 18.0 | Correct: True


  4%|▍         | 2/50 [00:22<08:46, 10.98s/it]


--- Q2 ---
Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it ...
Reasoning:
To find the total number of bolts of fiber needed, we need to calculate the amount of white fiber first. Since it's half the amount of blue fiber, and there are 2 bolts of blue fiber, the amount of wh... [truncated]
Gold: 3.0 | Pred: 3.0 | Correct: True


  6%|▌         | 3/50 [00:43<12:13, 15.61s/it]


--- Q3 ---
Question: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repai...
Reasoning:
To find the profit Josh made, we need to first find the new value of the house after the repairs. 

The original value of the house was $80,000. Josh put in $50,000 in repairs, which increased the val... [truncated]
Gold: 70000.0 | Pred: 70000.0 | Correct: True


 20%|██        | 10/50 [02:47<12:22, 18.57s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 50/50 [14:05<00:00, 16.91s/it]


EVALUATION COMPLETE
Model: unsloth/meta-Llama-3.1-8B-Instruct (4-bit)
Samples: 50
Accuracy: 78.00%



