In [1]:
from pathlib import Path

import argparse
import re
from dataclasses import dataclass
import pandas as pd
import torch

#import outlines
#from outlines.types.dsl import Regex, String, at_most, either, to_regex
#from outlines.types import zero_or_more, one_or_more, optional, whitespace, digit

from pydantic import BaseModel
from pydantic_evals import Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext

from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

from m_gsm_symbolic.kaenguruen.load_data import load_kaenguruen
from m_gsm_symbolic.load_data import load_gsm_dan, load_gsm_eng


# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(device)

INFO 08-11 09:50:40 [__init__.py:235] Automatically detected platform cuda.
cuda


In [2]:
import nest_asyncio

# Patch asyncio so `run_until_complete` can be called while the notebook's own
# event loop is already running; `Dataset.evaluate_sync` below relies on this.
nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv
import os

#load_dotenv()
#api_key = os.getenv("OPENAI_API_KEY")


from huggingface_hub import login
#login

In [4]:
#judge_llm_name = "openai:gpt-4o-2024-08-06"
# Model identifier passed to HuggingFaceAgent as the model under evaluation.
response_model_name = "meta-llama/Llama-3.2-1B"

In [5]:

# Grammar used for guided decoding of the model's answer:
#   * up to 300 characters of free-form reasoning
#     (`.` does not match a newline in Python's `re`, so the reasoning is a
#     single line there -- NOTE(review): confirm the decoding backend agrees),
#   * the literal marker '####',
#   * exactly one whitespace character,
#   * an unsigned integer answer.
# The lower repetition bound is spelled out (`{0,300}` instead of `{,300}`):
# both forms are equivalent for Python's `re`, but the open-lower-bound
# shorthand is not accepted by every regex engine, and this pattern is handed
# to an external guided-decoding backend rather than to `re`.
answer_pattern = r".{0,300}####\s\d+"

# Earlier attempt with the Outlines regex DSL (reasoning + '#### ' + number with
# an optional decimal part), kept for reference:
#answer_pattern = to_regex(
#    Regex(pattern=r".").at_most(500) +
#    String("#### ") +
#    digit.one_or_more() +
#    optional(
#        either(
#            String("."), String(",")
#        ) + optional(
#            digit.one_or_more()
#        )
#    )
#)
# TODO: support float or fraction answers as well, e.g.
#sequence(
#    one_or_more(digit()),
#    optional(either(
#        sequence(literal("."), one_or_more(digit())),
#        sequence(literal("/"), one_or_more(digit()))
#    ))
#)

class HuggingFaceAgent:
    """Few-shot completion agent that answers problems with a vLLM engine.

    The agent prepends worked examples to every question and constrains the
    generated text to ``answer_pattern`` through vLLM guided decoding.

    Attributes:
        tokenizer: Tokenizer loaded with ``AutoTokenizer.from_pretrained(model)``.
        model: The ``vllm.LLM`` engine used for generation.
        cases: Worked examples rendered in front of each new question.
        answer_pattern: Regular expression the generated text must match.
        max_tokens: Upper bound on the number of generated tokens per answer.
    """

    def __init__(self, model: str, examples: list,
                 answer_pattern: str = answer_pattern,
                 max_tokens: int = 512):
        """Load the tokenizer and the vLLM engine for ``model``.

        Args:
            model: Model identifier passed to the tokenizer and to ``LLM``.
            examples: Cases exposing ``inputs`` and ``expected_output``; they
                become the few-shot part of the prompt.
            answer_pattern: Regex enforced on the generated text.
            max_tokens: Generation budget per answer. vLLM's ``SamplingParams``
                default is far too small for reasoning followed by '#### <n>',
                so an explicit, larger budget is used.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = LLM(model=model)
        self.cases = examples
        self.answer_pattern = answer_pattern
        self.max_tokens = max_tokens

    def _build_prompt(self, prompt: str) -> str:
        """Render the few-shot examples followed by the open question.

        Each example is formatted as ``Problem: ...`` / ``Solution: ...``; the
        final block ends with ``Solution:`` so the model continues from there.
        """
        blocks = [
            f"Problem: {case.inputs}\n\nSolution: {case.expected_output}"
            for case in self.cases
        ]
        blocks.append(f"Problem: {prompt}\n\nSolution:")
        return "\n\n".join(blocks)

    def run(self, prompt: str) -> str:
        """Generate an answer for ``prompt`` and return it as plain text.

        Args:
            prompt: The problem statement to solve.

        Returns:
            The text of the first completion, or an empty string when the
            engine produced no completion.
        """
        full_prompt = self._build_prompt(prompt)
        guided_params = GuidedDecodingParams(regex=self.answer_pattern)
        sampling_params = SamplingParams(
            guided_decoding=guided_params,
            # Without an explicit budget generation is cut off long before the
            # '####' marker can be emitted.
            max_tokens=self.max_tokens,
        )
        request_outputs = self.model.generate(
            full_prompt,
            sampling_params=sampling_params
        )
        # `LLM.generate` returns one request output per prompt, each holding a
        # list of completions; callers (and the evaluator) expect a string.
        if not request_outputs or not request_outputs[0].outputs:
            return ""
        return request_outputs[0].outputs[0].text


In [15]:

cases = [p.to_case() for p in load_gsm_eng()]

# List the final numeric answer (the digits after '####', thousands separators
# removed) of every case. A case without such an answer is reported instead of
# aborting the whole listing with an AttributeError on a failed match.
for i, case in enumerate(cases):
    match = re.search(r'####\s*((\d|,)+)', case.expected_output or "")
    answer = match.group(1).replace(',', '') if match else "<no '####' answer found>"
    print(f"Case {i}: {answer}")

Case 0: 40
Case 1: 58
Case 2: 30
Case 3: 38
Case 4: 56
Case 5: 4
Case 6: 18
Case 7: 54
Case 8: 210
Case 9: 1050
Case 10: 8
Case 11: 4
Case 12: 5
Case 13: 55
Case 14: 2
Case 15: 11
Case 16: 77
Case 17: 118000
Case 18: 16
Case 19: 540
Case 20: 35
Case 21: 43200
Case 22: 64
Case 23: 12
Case 24: 88
Case 25: 10
Case 26: 130
Case 27: 342
Case 28: 70
Case 29: 320
Case 30: 157
Case 31: 2
Case 32: 25
Case 33: 72
Case 34: 75
Case 35: 140
Case 36: 16
Case 37: 14400
Case 38: 2
Case 39: 3200
Case 40: 100
Case 41: 5
Case 42: 14
Case 43: 4
Case 44: 63
Case 45: 2
Case 46: 15
Case 47: 200
Case 48: 4
Case 49: 694
Case 50: 4000
Case 51: 6
Case 52: 9
Case 53: 159
Case 54: 48
Case 55: 45
Case 56: 100
Case 57: 3
Case 58: 42
Case 59: 48
Case 60: 428
Case 61: 25
Case 62: 75
Case 63: 10
Case 64: 3430
Case 65: 18
Case 66: 78
Case 67: 16
Case 68: 8
Case 69: 49
Case 70: 3
Case 71: 20
Case 72: 34
Case 73: 36
Case 74: 48
Case 75: 1248
Case 76: 9500
Case 77: 16
Case 78: 2
Case 79: 36
Case 80: 98
Case 81: 30
Case 82:

In [None]:
# Convert every problem returned by load_gsm_eng() into an evaluation case.
cases = [p.to_case() for p in load_gsm_eng()]

response_model_name = "meta-llama/Llama-3.2-1B"
# The last three cases are used as few-shot examples in the agent's prompt.
agent_evaluated = HuggingFaceAgent(response_model_name, examples=cases[-3:], answer_pattern=answer_pattern)

# Custom evaluator: compare only the answer after '####'
@dataclass
class AnswerOnlyMatch(Evaluator):
    """Score a case as correct when the numbers after '####' are equal.

    Only the final answer is compared; the reasoning text before the marker is
    ignored, and thousands separators (',') are stripped before comparison.
    """

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        """Return True when output and expected output carry the same answer.

        Returns False when either side is not text, lacks a '####' answer, or
        the answer contains no digits.
        """
        predicted = self._final_answer(ctx.output)
        expected = self._final_answer(ctx.expected_output)
        if predicted is None or expected is None:
            return False
        return predicted == expected

    @staticmethod
    def _final_answer(text) -> float | None:
        """Extract the number following the first '####' marker, if any."""
        # A task may hand back something other than a string (or nothing at
        # all); treat that as "no answer" rather than letting re.search raise.
        if not isinstance(text, str):
            return None
        match = re.search(r"####\s*((\d|,)+)", text)
        if match is None:
            return None
        digits = match.group(1).replace(",", "")
        # The pattern also matches a run made only of commas; float("") would raise.
        if not digits:
            return None
        return float(digits)

# Evaluate on the first two cases only, scored by final-answer equality.
# These do not overlap with the few-shot examples, which are taken from the
# end of `cases`.
ds = Dataset(
    cases=cases[:2],
    evaluators=[AnswerOnlyMatch()],
)

async def answer_question(question: str) -> str:
    """Task passed to ``Dataset.evaluate_sync``: answer one question as text.

    Args:
        question: The problem statement of the case being evaluated.

    Returns:
        The generated answer text (empty when nothing was generated).
    """
    result = agent_evaluated.run(question)
    if isinstance(result, str):
        return result
    # A vLLM engine hands back a list of request outputs, each with a list of
    # completions; the evaluator needs the plain text of the first one.
    if not result or not result[0].outputs:
        return ""
    return result[0].outputs[0].text

# Run the task over the dataset and print a report that includes each case's
# input, the generated output and the expected output.
report = ds.evaluate_sync(answer_question)
report.print(include_input=True, include_output=True, include_expected_output=True)

In [6]:
# Manual smoke test: build the agent and print its raw result for two cases.
cases = [p.to_case() for p in load_gsm_eng()]
response_model_name = "meta-llama/Llama-3.2-1B"
# NOTE(review): this constructs a new engine each time the cell is run.
agent_evaluated = HuggingFaceAgent(response_model_name, examples=cases[-3:], answer_pattern=answer_pattern)
for case in cases[:2]:
    # The case is printed before generation, so it is visible even if run() fails.
    print("Case:")
    print(case)
    print("Output:")
    print(agent_evaluated.run(case.inputs))

INFO 08-11 09:51:00 [config.py:1604] Using max model len 131072
INFO 08-11 09:51:03 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 08-11 09:51:03 [llm_engine.py:228] Initializing a V0 LLM engine (v0.10.0) with config: model='meta-llama/Llama-3.2-1B', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpo

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-11 09:51:09 [default_loader.py:262] Loading weights took 1.01 seconds
INFO 08-11 09:51:10 [model_runner.py:1115] Model loading took 2.3185 GiB and 1.678835 seconds
INFO 08-11 09:51:11 [worker.py:295] Memory profiling takes 0.70 seconds
INFO 08-11 09:51:11 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.27GiB) x gpu_memory_utilization (0.90) = 42.54GiB
INFO 08-11 09:51:11 [worker.py:295] model weights take 2.32GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.18GiB; the rest of the memory reserved for KV Cache is 38.99GiB.
INFO 08-11 09:51:11 [executor_base.py:113] # cuda blocks: 79843, # CPU blocks: 8192
INFO 08-11 09:51:11 [executor_base.py:118] Maximum concurrency for 131072 tokens per request: 9.75x
INFO 08-11 09:51:13 [model_runner.py:1385] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 08-11 09:51:39 [model_runner.py:1537] Graph capturing finished in 25 secs, took 0.13 GiB
INFO 08-11 09:51:39 [llm_engine.py:424] init engine (profile, create kv cache, warmup model) took 29.09 seconds
Case:
Case(name='661', inputs="Roger goes to the store to buy some coffee. The normal brand of coffee he buys costs $5 per pound. He had to buy a more expensive brand that costs 20% more since his favorite brand was sold out. He decides to buy a week's worth of coffee and he uses 1 pound of coffee per day. He also decided to buy himself a donut for $2. How much did everything cost?", metadata={'filepath': '/home/simonenni/repos/m-gsm-symbolic/data/templates/eng/symbolic/0042.json'}, expected_output='The coffee he bought was 5*0.2=$<<5*0.2=1>>1 more expensive per pound than what he normally buys\nSo it cost 5+1=$<<5+1=6>>6 per pound\nHe goes through 1*7=<<1*7=7>>7 pounds of coffee a week\nSo he paid 6*7=$<<6*7=42>>42 on coffee\nThat means his total bill was 42+2=$<<42+2=44>>44\n#### 4

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Unsupported conversion from f16 to f16
LLVM ERROR: Unsupported rounding mode for conversion.
#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked3 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
#blocked4 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [0, 1]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:75", "ttg.threads-per-warp" = 32 : i32} {
  t

RuntimeError: PassManager::run failed

In [7]:
import torch
# Show how much GPU memory PyTorch reports as allocated and as reserved
# (presumably byte counts -- TODO confirm against the PyTorch docs).
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

2481197568
2650800128


In [None]:
# Load the plain transformers model on the GPU when one is available and on
# the CPU otherwise, instead of assuming CUDA and failing on CPU-only hosts.
# `device` stays a torch.device, as in the setup cell at the top.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", torch_dtype="auto").to(device)

In [18]:
# Display the first case to inspect its fields.
cases[:1]

[Case(name='728', inputs='Candy has 15 light blue spools of thread, 45 dark blue spools of thread, 40 light green spools of thread, and 50 dark green spools of thread. What percent of her spools are blue?', metadata={'filepath': '/home/au338890/repos/m-gsm-symbolic/data/templates/eng/symbolic/0066.json'}, expected_output='First find the number of blue spools: 15 spools + 45 spools = <<15+45=60>>60 spools\nThen find the total number of spools: 40 spools + 50 spools + 60 spools = <<40+50+60=150>>150 spools\nThen divide the number of blue spools by the total number of spools and multiply by 100% to express the answer as a percentage: 60 spools / 150 spools * 100% = 40%\n#### 40', evaluators=[])]

In [19]:
# Rebuild the agent with only the last case as few-shot example and the default answer pattern.
agent_evaluated = HuggingFaceAgent(response_model_name, examples=cases[-1:])

In [20]:

async def answer_question(question: str) -> str:
    """Task passed to ``Dataset.evaluate_sync``: answer one question as text.

    Args:
        question: The problem statement of the case being evaluated.

    Returns:
        The generated answer text (empty when nothing was generated).
    """
    result = agent_evaluated.run(question)
    if isinstance(result, str):
        return result
    # A vLLM engine hands back a list of request outputs, each with a list of
    # completions; the evaluator needs the plain text of the first one.
    if not result or not result[0].outputs:
        return ""
    return result[0].outputs[0].text

# Re-run the evaluation with the single-example agent; `ds` comes from the earlier evaluation cell.
report_2 = ds.evaluate_sync(answer_question)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  + Exception Group Traceback (most recent call last):
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/IPython/core/interactiveshell.py", line 3672, in run_code
  |     exec(code_obj, self.user_global_ns, self.user_ns)
  |     ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/tmp/ipykernel_10327/1812324380.py", line 5, in <module>
  |     report_2 = ds.evaluate_sync(answer_question)
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/pydantic_evals/dataset.py", line 315, in evaluate_sync
  |     return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
  |            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/nest_asyncio.py", line 98, in run_until_

In [14]:
# Requires the cell that assigns `report_2` to have completed; otherwise this raises NameError.
report_2.print(include_input=True, include_output=True, include_expected_output=True)

NameError: name 'report_2' is not defined

In [None]:
# NOTE(review): `df` is not defined anywhere in the visible notebook -- it must come from hidden kernel state.
print(df.loc[0].output)



Hvis hver pirat svarede sandt til den givne opgave, skal et antal mønter, der er tilfældigt resulterer i et antal mønter, der er lik 30.
I et ufuldstændigt skattecontainer, skal du vurdere, hvor mange mønter, der er i containeren, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.

Før du tager beslutningen, skal du udføre en fejlfindende analyse.
Tælle alle mønterne i containeren, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.
Der er 6 mønter, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.
Antallet af mønter, der er i den rigtige mønter, er 10.

Det er ikke sandt, at der er 10 mønter i den rigtige mønter.
Antallet af mønter i den rigtige mønter er 10.
For at bestemme, hvor mange mønter, der er i den rigtige mønter, er det mest almindelige.
Tæll alle mønterne, der er i containeren, der er i den rigtige mønter.
|     | Gold | Silver | Bronze |
| Tom |   9   |   11   |   12   |

In [None]:
# NOTE(review): `df` is not defined anywhere in the visible notebook -- it must come from hidden kernel state.
print(df.loc[0].assertion_reason)

The output does not provide the correct solution to the problem. It repeats incorrect numbers for the coins and does not solve the logic puzzle based on the condition given (only one pirate tells the truth).
