In [None]:
import argparse
import re
from dataclasses import dataclass
from pathlib import Path

import outlines
import pandas as pd
import torch
# NOTE(review): the stale ImportError shown under this cell suggests the installed
# outlines may not export every name listed here — verify against the pinned version.
from outlines.types import (
    Regex, String, whitespace, named_group, one_or_more, digit, optional, sequence, any_char, zero_or_more
)
from pydantic import BaseModel
from pydantic_evals import Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext
from transformers import AutoTokenizer, AutoModelForCausalLM

from m_gsm_symbolic.kaenguruen.load_data import load_kaenguruen
from m_gsm_symbolic.load_data import load_gsm_dan, load_gsm_eng


# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)


ImportError: cannot import name 'literal' from 'outlines.types' (/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.12/site-packages/outlines/types/__init__.py)

In [2]:
import nest_asyncio

# `ds.evaluate_sync(...)` calls `run_until_complete` on the current event loop (see the
# tracebacks further down); nest_asyncio is applied so that works inside the notebook.
nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv
import os

#load_dotenv()
#api_key = os.getenv("OPENAI_API_KEY")


from huggingface_hub import login
#login

In [None]:
#judge_llm_name = "openai:gpt-4o-2024-08-06"
# Model id of the Hugging Face model under evaluation (passed to `from_pretrained`).
response_model_name = "meta-llama/Llama-3.2-1B"

In [None]:

# 1. Outlines regex DSL: reasoning steps (zero or more) + '####' + whitespace + answer (int)
# The Outlines term itself is handed to the model as its output type, so there is no
# explicit `to_regex(...)` conversion here (that name is never imported in this notebook).
answer_pattern = (
    # Reasoning: text whose steps end in a calculator annotation such as "=<<15+45=60>>60".
    # Only this part is optional; the '####' marker and the integer answer are mandatory.
    zero_or_more(Regex(r"(?s)(?:.*?=<<[^<>]+=[^<>]+>>\d+\.*\d*\s*)+"))
    + String("####")
    # `whitespace` and `digit` are used as ready-made terms, not called as factories.
    + whitespace
    + one_or_more(digit)
)
# Support float or frac as well
    #sequence(
    #    one_or_more(digit()),
    #    optional(either(
    #        sequence(literal("."), one_or_more(digit())),
    #        sequence(literal("/"), one_or_more(digit()))
    #    ))
    #)

class HuggingFaceAgent:
    """Few-shot prompted Hugging Face causal LM with Outlines-constrained decoding.

    The agent prepends a fixed set of solved examples to every question and asks
    the Outlines-wrapped model to generate a completion that matches
    ``output_type``.
    """

    def __init__(self, model: str, examples: list, output_type=None):
        """Load the tokenizer/model and remember the few-shot examples.

        Args:
            model: Model id or path handed to ``from_pretrained``.
            examples: Solved cases shown before the question. Each item must
                expose ``inputs`` and ``expected_output`` attributes.
            output_type: Outlines output type that constrains generation.
                Defaults to the notebook-level ``answer_pattern``.
        """
        # Keep a private copy so the prompt only ever contains the examples the
        # caller selected (never the full, evaluation-time case list).
        self.examples = list(examples or [])
        self.output_type = answer_pattern if output_type is None else output_type
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = outlines.from_transformers(
            AutoModelForCausalLM.from_pretrained(model, torch_dtype="auto").to(device),
            self.tokenizer,
        )

    def _build_prompt(self, prompt):
        """Return the few-shot prompt: solved examples followed by the open question.

        Every example is rendered as ``Problem: ...`` / ``Solution: ...``; the
        final block ends with ``Solution:`` so the model continues from there.
        Blocks are separated by a blank line.
        """
        blocks = [
            f"Problem: {case.inputs}\n\nSolution: {case.expected_output}"
            for case in self.examples
        ]
        blocks.append(f"Problem: {prompt}\n\nSolution:")
        return "\n\n".join(blocks)

    def run(self, prompt: str, max_new_tokens: int = 500):
        """Generate a constrained solution for ``prompt``.

        Args:
            prompt: The question to solve (without few-shot examples).
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The model's completion as returned by the Outlines model.
        """
        full_prompt = self._build_prompt(prompt)
        # The Outlines model takes the raw prompt text and returns the generated
        # text, so no manual tokenisation, prompt slicing or decoding is needed.
        return self.model(full_prompt, self.output_type, max_new_tokens=max_new_tokens)


In [None]:
# Turn every problem returned by `load_gsm_eng()` into an evaluation case.
cases = [problem.to_case() for problem in load_gsm_eng()]

# Build the agent under evaluation, passing the last three cases as few-shot examples.
response_model_name = "meta-llama/Llama-3.2-1B"
agent_evaluated = HuggingFaceAgent(model=response_model_name, examples=cases[-3:])

# Custom evaluator: compare only the answer after '####'
@dataclass
class AnswerOnlyMatch(Evaluator):
    """Score a case as correct when the integers following '####' agree.

    The reasoning text is ignored; only the first ``#### <digits>`` marker in
    the model output and in the expected output is compared.
    """

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        """Return True iff both texts contain a '####' answer and the integers are equal."""
        predicted = self._final_answer(ctx.output)
        expected = self._final_answer(ctx.expected_output)
        if predicted is None or expected is None:
            # A missing marker on either side counts as a failed case.
            return False
        return predicted == expected

    @staticmethod
    def _final_answer(text):
        """Extract the integer after the first '####' marker, or None if unavailable."""
        # Non-string values (e.g. a missing expected output) would make
        # `re.search` raise a TypeError, so treat them as "no answer".
        if not isinstance(text, str):
            return None
        match = re.search(r"####\s*(\d+)", text)
        return int(match.group(1)) if match else None
    
# Evaluation set: only the first two cases, each scored by AnswerOnlyMatch.
ds = Dataset(
    cases=cases[:2],
    evaluators=[AnswerOnlyMatch()],
)

async def answer_question(question: str) -> str:
    """Task under evaluation: pass the question to ``agent_evaluated`` and return its reply."""
    return agent_evaluated.run(question)

# Run `answer_question` over every case in `ds`, then print the report
# including inputs, model outputs and expected outputs.
report = ds.evaluate_sync(answer_question)
report.print(include_input=True, include_output=True, include_expected_output=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  + Exception Group Traceback (most recent call last):
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/IPython/core/interactiveshell.py", line 3672, in run_code
  |     exec(code_obj, self.user_global_ns, self.user_ns)
  |     ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/tmp/ipykernel_10327/1909619641.py", line 23, in <module>
  |     report = ds.evaluate_sync(answer_question)
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/pydantic_evals/dataset.py", line 315, in evaluate_sync
  |     return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
  |            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/home/au338890/repos/m-gsm-symbolic/.ve

In [10]:
# NOTE(review): `model` is only assigned in the cell below, so this cell depends on
# out-of-order execution — confirm and reorder.
# Only the last expression is displayed; `model.device` is evaluated but not shown.
model.device
model.dtype


torch.bfloat16

In [None]:
# Load the plain Hugging Face model for inspection. Pick the device from what is
# actually available instead of hard-coding "cuda", which fails on CPU-only hosts,
# and reuse the configured model id rather than repeating the literal.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(response_model_name, torch_dtype="auto").to(device)

In [18]:
# Display the first case to inspect its fields (inputs, metadata, expected_output).
cases[:1]

[Case(name='728', inputs='Candy has 15 light blue spools of thread, 45 dark blue spools of thread, 40 light green spools of thread, and 50 dark green spools of thread. What percent of her spools are blue?', metadata={'filepath': '/home/au338890/repos/m-gsm-symbolic/data/templates/eng/symbolic/0066.json'}, expected_output='First find the number of blue spools: 15 spools + 45 spools = <<15+45=60>>60 spools\nThen find the total number of spools: 40 spools + 50 spools + 60 spools = <<40+50+60=150>>150 spools\nThen divide the number of blue spools by the total number of spools and multiply by 100% to express the answer as a percentage: 60 spools / 150 spools * 100% = 40%\n#### 40', evaluators=[])]

In [19]:
# Rebuild the agent, this time passing only the last case as the few-shot example.
agent_evaluated = HuggingFaceAgent(response_model_name, examples=cases[-1:])

In [20]:

async def answer_question(question: str) -> str:
    """Task under evaluation: pass the question to ``agent_evaluated`` and return its reply."""
    return agent_evaluated.run(question)

# Re-run the evaluation on `ds` with the rebuilt agent and keep the report as `report_2`.
report_2 = ds.evaluate_sync(answer_question)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  + Exception Group Traceback (most recent call last):
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/IPython/core/interactiveshell.py", line 3672, in run_code
  |     exec(code_obj, self.user_global_ns, self.user_ns)
  |     ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/tmp/ipykernel_10327/1812324380.py", line 5, in <module>
  |     report_2 = ds.evaluate_sync(answer_question)
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/pydantic_evals/dataset.py", line 315, in evaluate_sync
  |     return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
  |            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/home/au338890/repos/m-gsm-symbolic/.venv/lib/python3.13/site-packages/nest_asyncio.py", line 98, in run_until_

In [14]:
# Print the second report; requires the cell that assigns `report_2` to have completed
# (the recorded NameError shows it had not).
report_2.print(include_input=True, include_output=True, include_expected_output=True)

NameError: name 'report_2' is not defined

In [None]:
# NOTE(review): `df` is not defined anywhere in the visible notebook — presumably a
# results table left over from an earlier session; confirm where it comes from.
# Prints the `output` field of the row labelled 0.
print(df.loc[0].output)



Hvis hver pirat svarede sandt til den givne opgave, skal et antal mønter, der er tilfældigt resulterer i et antal mønter, der er lik 30.
I et ufuldstændigt skattecontainer, skal du vurdere, hvor mange mønter, der er i containeren, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.

Før du tager beslutningen, skal du udføre en fejlfindende analyse.
Tælle alle mønterne i containeren, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.
Der er 6 mønter, der er i den rigtige mønter.
Antallet af mønter, der er i den rigtige mønter, er 10.
Antallet af mønter, der er i den rigtige mønter, er 10.

Det er ikke sandt, at der er 10 mønter i den rigtige mønter.
Antallet af mønter i den rigtige mønter er 10.
For at bestemme, hvor mange mønter, der er i den rigtige mønter, er det mest almindelige.
Tæll alle mønterne, der er i containeren, der er i den rigtige mønter.
|     | Gold | Silver | Bronze |
| Tom |   9   |   11   |   12   |

In [None]:
# NOTE(review): `df` is not defined anywhere in the visible notebook — confirm its origin.
# Prints the `assertion_reason` field of the row labelled 0.
print(df.loc[0].assertion_reason)

The output does not provide the correct solution to the problem. It repeats incorrect numbers for the coins and does not solve the logic puzzle based on the condition given (only one pirate tells the truth).
