In [10]:
from pathlib import Path

import argparse
import pandas as pd
import torch

import outlines
from outlines.types.dsl import Regex, String, zero_or_more, one_or_more, at_most, either, to_regex, optional
from outlines.types import whitespace, digit

from pydantic import BaseModel
from pydantic_evals import Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext

from transformers import AutoTokenizer, AutoModelForCausalLM

from m_gsm_symbolic.kaenguruen.load_data import load_kaenguruen
from m_gsm_symbolic.load_data import load_gsm_dan, load_gsm_eng


if torch.cuda.is_available():
    dev = "cuda"
else:
    dev = "cpu"
device = torch.device(dev)


In [2]:
import nest_asyncio

nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv
import os

#load_dotenv()
#api_key = os.getenv("OPENAI_API_KEY")


from huggingface_hub import login
#login

In [None]:
#judge_llm_name = "openai:gpt-4o-2024-08-06"
response_model_name = "meta-llama/Llama-3.2-1B"

In [None]:

# 1. Outlines regex DSL: reasoning (any text) + '####' + answer (int)
answer_pattern = to_regex(
    at_most(Regex(r"."), 500) +
    String("#### ") +
    one_or_more(digit()) +
    optional(
        either(
            literal("."), literal(",")
        ) + optional(
            one_or_more(digit())
        )
    )
)
# Support float or frac as well
    #sequence(
    #    one_or_more(digit()),
    #    optional(either(
    #        sequence(literal("."), one_or_more(digit())),
    #        sequence(literal("/"), one_or_more(digit()))
    #    ))
    #)

class HuggingFaceAgent:
    def __init__(self, model: str, examples: list, 
                 answer_pattern: Regex = answer_pattern):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = outlines.from_transformers(AutoModelForCausalLM.from_pretrained(model, torch_dtype="auto").to(device), self.tokenizer)
        self.cases = examples
        self.answer_pattern = answer_pattern

    def _build_prompt(self, prompt):
        examples = []
        for case in self.cases:
            example = f"Problem: {case.inputs}\n\nSolution: {case.expected_output}"
            examples.append(example)

        prompt = f"Problem: {prompt}\n\nSolution:"
        examples.append(prompt)
        prompt = "\n\n".join(examples)
        return prompt

    def run(self, prompt: str):
        prompt = self._build_prompt(prompt)
        model_input = self.tokenizer(prompt, return_tensors="pt").to(device)

        model_input.pop("token_type_ids", None)

        input_len = model_input["input_ids"].shape[-1]

        model_output = self.model(prompt, self.answer_pattern, max_new_tokens=500)
        del model_input
        model_output = model_output[0][input_len:]

        output_decoded = self.tokenizer.decode(model_output, skip_special_tokens=True)
        del model_output
        return output_decoded


In [15]:

cases = [p.to_case() for p in load_gsm_eng()]

for i, case in enumerate(cases):
    print(f"Case {i}: {re.search(r'####\s*((\d|,)+)', case.expected_output).group(1).replace(',', '')}")

Case 0: 40
Case 1: 58
Case 2: 30
Case 3: 38
Case 4: 56
Case 5: 4
Case 6: 18
Case 7: 54
Case 8: 210
Case 9: 1050
Case 10: 8
Case 11: 4
Case 12: 5
Case 13: 55
Case 14: 2
Case 15: 11
Case 16: 77
Case 17: 118000
Case 18: 16
Case 19: 540
Case 20: 35
Case 21: 43200
Case 22: 64
Case 23: 12
Case 24: 88
Case 25: 10
Case 26: 130
Case 27: 342
Case 28: 70
Case 29: 320
Case 30: 157
Case 31: 2
Case 32: 25
Case 33: 72
Case 34: 75
Case 35: 140
Case 36: 16
Case 37: 14400
Case 38: 2
Case 39: 3200
Case 40: 100
Case 41: 5
Case 42: 14
Case 43: 4
Case 44: 63
Case 45: 2
Case 46: 15
Case 47: 200
Case 48: 4
Case 49: 694
Case 50: 4000
Case 51: 6
Case 52: 9
Case 53: 159
Case 54: 48
Case 55: 45
Case 56: 100
Case 57: 3
Case 58: 42
Case 59: 48
Case 60: 428
Case 61: 25
Case 62: 75
Case 63: 10
Case 64: 3430
Case 65: 18
Case 66: 78
Case 67: 16
Case 68: 8
Case 69: 49
Case 70: 3
Case 71: 20
Case 72: 34
Case 73: 36
Case 74: 48
Case 75: 1248
Case 76: 9500
Case 77: 16
Case 78: 2
Case 79: 36
Case 80: 98
Case 81: 30
Case 82:

In [None]:
cases = [p.to_case() for p in load_gsm_eng()]

response_model_name = "meta-llama/Llama-3.2-1B"
agent_evaluated = HuggingFaceAgent(response_model_name, examples=cases[-3:], answer_pattern=answer_pattern)

# Custom evaluator: compare only the answer after '####'
@dataclass
class AnswerOnlyMatch(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # Extract answer after '####' using regex
        m_pred = re.search(r"####\s*((\d|,)+)", ctx.output)
        m_true = re.search(r"####\s*((\d|,)+)", ctx.expected_output)
        if not m_pred or not m_true:
            return False
        return float(m_pred.group(1).replace(",","")) == float(m_true.group(1).replace(",",""))

ds = Dataset(
    cases=cases[:2],
    evaluators=[AnswerOnlyMatch()],
)

async def answer_question(question: str) -> str:
    r = agent_evaluated.run(question)
    return r

report = ds.evaluate_sync(answer_question)
report.print(include_input=True, include_output=True, include_expected_output=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  + Exception Group Traceback (most recent call last):
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/IPython/core/interactiveshell.py", line 3672, in run_code
  |     exec(code_obj, self.user_global_ns, self.user_ns)
  |     ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/tmp/ipykernel_10327/1909619641.py", line 23, in <module>
  |     report = ds.evaluate_sync(answer_question)
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/pydantic_evals/dataset.py", line 315, in evaluate_sync
  |     return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
  |            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/home/au338890/repos/m-gsm-symbolic/.ve

In [10]:
model.device
model.dtype


torch.bfloat16

In [None]:
device = "cuda"
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", torch_dtype="auto").to(device)

In [18]:
cases[:1]

[Case(name='728', inputs='Candy has 15 light blue spools of thread, 45 dark blue spools of thread, 40 light green spools of thread, and 50 dark green spools of thread. What percent of her spools are blue?', metadata={'filepath': '/home/au338890/repos/m-gsm-symbolic/data/templates/eng/symbolic/0066.json'}, expected_output='First find the number of blue spools: 15 spools + 45 spools = <<15+45=60>>60 spools\nThen find the total number of spools: 40 spools + 50 spools + 60 spools = <<40+50+60=150>>150 spools\nThen divide the number of blue spools by the total number of spools and multiply by 100% to express the answer as a percentage: 60 spools / 150 spools * 100% = 40%\n#### 40', evaluators=[])]

In [19]:
agent_evaluated = HuggingFaceAgent(response_model_name, examples=cases[-1:])

In [20]:

async def answer_question(question: str) -> str:
    r = agent_evaluated.run(question)
    return r

report_2 = ds.evaluate_sync(answer_question)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  + Exception Group Traceback (most recent call last):
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/IPython/core/interactiveshell.py", line 3672, in run_code
  |     exec(code_obj, self.user_global_ns, self.user_ns)
  |     ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/tmp/ipykernel_10327/1812324380.py", line 5, in <module>
  |     report_2 = ds.evaluate_sync(answer_question)
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/pydantic_evals/dataset.py", line 315, in evaluate_sync
  |     return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
  |            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/nest_asyncio.py", line 98, in run_until_

In [14]:
report_2.print(include_input=True, include_output=True, include_expected_output=True)

NameError: name 'report_2' is not defined

In [None]:
print(df.loc[0].output)



Hvis hver pirat svarede sandt til den givne opgave, skal et antal mønter, der er tilfældigt resulterer i et antal mønter, der er lik 30.
I et ufuldstændigt skattecontainer, skal du vurdere, hvor mange mønter, der er i containeren, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.

Før du tager beslutningen, skal du udføre en fejlfindende analyse.
Tælle alle mønterne i containeren, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.
Der er 6 mønter, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.
Antallet af mønter, der er i den rigtige mønter, er 10.

Det er ikke sandt, at der er 10 mønter i den rigtige mønter.
Antallet af mønter i den rigtige mønter er 10.
For at bestemme, hvor mange mønter, der er i den rigtige mønter, er det mest almindelige.
Tæll alle mønterne, der er i containeren, der er i den rigtige mønter.
|     | Gold | Silver | Bronze |
| Tom |   9   |   11   |   12   |

In [None]:
print(df.loc[0].assertion_reason)

The output does not provide the correct solution to the problem. It repeats incorrect numbers for the coins and does not solve the logic puzzle based on the condition given (only one pirate tells the truth).
