In [9]:
import re

def _extract_boxed(text: str) -> "str | None":
    """Return the contents of the first brace-balanced '\\boxed{...}' in text, or None."""
    marker = '\\boxed{'
    start = text.find(marker)
    while start != -1:
        body_start = start + len(marker)
        depth = 1
        for pos in range(body_start, len(text)):
            ch = text[pos]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return text[body_start:pos]
        # This occurrence never closes; a later one may still be balanced.
        start = text.find(marker, body_start)
    return None


def parse_llm_response(text: str) -> "float | str":
    """
    Parse an LLM response and extract the final answer.

    1. If there is a brace-balanced '\\boxed{...}', take the contents of the
       first one. Nested braces (e.g. '\\boxed{\\frac{1}{2}}') are kept intact.
    2. Otherwise, find all numeric tokens (which may include commas or dots)
       and take the last one.
    3. Remove dollar signs, surrounding whitespace and commas, then try to
       convert the result to a float.

    Args:
        text: The raw response text.

    Returns:
        The answer as a float when it is numeric (e.g. "1,032.56" -> 1032.56),
        otherwise the cleaned answer string (e.g. a LaTeX expression).

    Raises:
        ValueError: If the text contains neither a boxed value nor any
            numeric token.
    """
    # 1. Try to find \boxed{...}; an empty box still counts as a match.
    answer = _extract_boxed(text)
    if answer is None:
        # 2. Find all number-like tokens: sequences of digits with optional
        #    commas or dots (e.g. "1,032.56" or "18")
        nums = re.findall(r'-?\d+(?:[.,]\d+)*', text)
        if not nums:
            raise ValueError("No boxed value or numeric token found in response.")
        answer = nums[-1]

    # 3. Remove dollar signs, surrounding whitespace and thousands separators.
    answer = answer.replace('$', '').strip().replace(',', '')
    try:
        return float(answer)
    except ValueError:
        # Not a plain number (e.g. "\frac{1}{2}" or "18 dollars"): keep the text.
        return answer



# Smoke-test the parser on two hand-written responses.

# Case 1: the answer is wrapped in \boxed{...}.
resp = """Okay, let's break down this problem step by step...

Final Answer: \\boxed{18}"""

# Case 2: no \boxed marker, so the last numeric token in the text is used.
resp2 = "We work it out, and in the end we get 1,032.56 but"

print(parse_llm_response(resp))   # prints 18.0 (numeric answers come back as float)
print(parse_llm_response(resp2))  # prints 1032.56 (comma stripped before conversion)


18.0
1032.56


In [1]:
import os
# Display the entries of the local 'gsm8k' directory (path is relative to the
# notebook's working directory); the evaluation cell iterates over these.
os.listdir('gsm8k')

['allam_samples_gsm8k_boxed_2025-06-09T13-10-20.997147.jsonl',
 'allam_samples_gsm8k_darija_boxed_2025-06-09T12-48-05.709892.jsonl',
 'atlas27b_samples_gsm8k_boxed_2025-06-01T03-07-46.691518.jsonl',
 'atlas27b_samples_gsm8k_darija_boxed_2025-06-01T02-17-44.558714.jsonl',
 'atlas9b_samples_gsm8k_boxed_2025-06-08T15-18-05.933509.jsonl',
 'atlas9b_samples_gsm8k_darija_boxed_2025-06-08T14-22-08.188197.jsonl',
 'gemma327b_samples_gsm8k_boxed_2025-06-01T21-08-43.631813.jsonl',
 'gemma327b_samples_gsm8k_darija_boxed_2025-06-01T19-15-28.814912.jsonl',
 'gemmaroc_samples_gsm8k_boxed_2025-06-01T05-30-24.708510.jsonl',
 'gemmaroc_samples_gsm8k_darija_boxed_2025-06-01T03-40-16.247174.jsonl']

In [10]:
import json
# Score every file in 'gsm8k': for each JSONL record, parse the target and the
# first model response, then report the exact-match rate per file.
# sorted() makes the report order reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir('gsm8k')):
    filepath = os.path.join('gsm8k', filename)
    print(filename)
    ground_truths = []
    llm_responses = []
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of crashing in json.loads
            obj = json.loads(line)
            ground_truths.append(obj["target"])
            llm_responses.append(obj["resps"][0][0])

    # Targets are parsed without a guard: an unparseable target is a data
    # problem and should surface as an error.
    parsed_g = [parse_llm_response(target) for target in ground_truths]

    # Responses are parsed best-effort; failures are recorded with a sentinel.
    parsed_res = []
    for response in llm_responses:
        try:
            parsed_res.append(parse_llm_response(response))
        except Exception:  # not a bare except: Ctrl-C must still interrupt the run
            parsed_res.append('[invalid]')

    identic = [i for i, (g, r) in enumerate(zip(parsed_g, parsed_res)) if g == r]
    non_identifc = [i for i, (g, r) in enumerate(zip(parsed_g, parsed_res)) if g != r]

    # Mismatch views; not used below, presumably kept for manual inspection of
    # the last processed file after the loop.
    parsed_g_non_identifc = [parsed_g[i] for i in non_identifc]
    parsed_res_non_identifc = [parsed_res[i] for i in non_identifc]
    ground_truths_non_identifc = [ground_truths[i] for i in non_identifc]
    llm_responses_non_identifc = [llm_responses[i] for i in non_identifc]

    # An empty file would otherwise raise ZeroDivisionError and abort the run.
    score = len(identic) / len(parsed_g) if parsed_g else float('nan')

    print('nb of invalid parsing: '+str(parsed_res.count('[invalid]')))
    print("score: "+str(score))
    print('--------')
    

allam_samples_gsm8k_boxed_2025-06-09T13-10-20.997147.jsonl
nb of invalid parsing: 0
score: 0.686125852918878
--------
allam_samples_gsm8k_darija_boxed_2025-06-09T12-48-05.709892.jsonl
nb of invalid parsing: 0
score: 0.40333586050037906
--------
atlas27b_samples_gsm8k_boxed_2025-06-01T03-07-46.691518.jsonl
nb of invalid parsing: 0
score: 0.8233510235026535
--------
atlas27b_samples_gsm8k_darija_boxed_2025-06-01T02-17-44.558714.jsonl
nb of invalid parsing: 0
score: 0.7103866565579985
--------
atlas9b_samples_gsm8k_boxed_2025-06-08T15-18-05.933509.jsonl
nb of invalid parsing: 0
score: 0.7702805155420773
--------
atlas9b_samples_gsm8k_darija_boxed_2025-06-08T14-22-08.188197.jsonl
nb of invalid parsing: 0
score: 0.6679302501895376
--------
gemma327b_samples_gsm8k_boxed_2025-06-01T21-08-43.631813.jsonl
nb of invalid parsing: 0
score: 0.956027293404094
--------
gemma327b_samples_gsm8k_darija_boxed_2025-06-01T19-15-28.814912.jsonl
nb of invalid parsing: 0
score: 0.8271417740712661
--------
gem