# Experiment with Results File here while main program is running

In [1]:
import json
import os

output_dir = "../machine_exers/outputs"
results_file = os.path.join(output_dir, "results.json")

# Load the results from the file
with open(results_file, "r") as f:
   results = json.load(f)

# Print the results
no_ans_counter = 0
for question_id, result in results.items():
    if result["message"]:
        no_ans_counter += 1

print(f"Number of questions with no answer: {no_ans_counter}")


Number of questions with no answer: 1105


In [2]:
from datasets import load_dataset

math_dataset_test = load_dataset('competition_math', split='test')

In [3]:
import re 

def find_last_boxed_answer(text):
    # This pattern matches nested braces
    pattern = r'\\boxed{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)}'
    boxed_answers = re.findall(pattern, text)

    if len(boxed_answers) == 0:
        return None
    else:
        return boxed_answers[-1]

In [48]:
# Extract boxed answers from the 'solution' feature before evaluation
for index in range(len(math_dataset_test)):
    solution = math_dataset_test[index]['solution']
    boxed_answer_in_solution = find_last_boxed_answer(solution)

    # Print message if no boxed answer is found
    if len(boxed_answer_in_solution) == 0:
        print(f"Index: {index}, Boxed Answers: {boxed_answer_in_solution}")

    results[f'{index}']["problem"] = math_dataset_test[index]['problem']
    results[f'{index}']["solution_answer"] = boxed_answer_in_solution

## Enhance exact match using sympy (NOT FINAL)

https://github.com/openai/simple-evals/blob/main/math_eval.py

In [27]:
import sympy as sp

# Function to judge equality using the template
def judge_equality_llm(expression1, expression2):
    prompt = EQUALITY_TEMPLATE % {"expression1": expression1, "expression2": expression2}
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=10)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return response

def enhance_exact_match(generated_answer, reference_answer):
    try:
        # Handle intervals
        if "Interval" in generated_answer or "Interval" in reference_answer:
            generated_expr = eval(generated_answer, {"Interval": sp.Interval})
            reference_expr = eval(reference_answer, {"Interval": sp.Interval})
        # Handle sets
        elif "{" in generated_answer or "{" in reference_answer:
            generated_expr = sp.FiniteSet(*sp.sympify(generated_answer))
            reference_expr = sp.FiniteSet(*sp.sympify(reference_answer))
        else:
            # Parse the generated and reference answers into symbolic expressions
            generated_expr = sp.sympify(generated_answer)
            reference_expr = sp.sympify(reference_answer)
        
        
        # Simplify both expressions
        simplified_generated = sp.simplify(generated_expr)
        simplified_reference = sp.simplify(reference_expr)
        
        # Compare the simplified expressions
        if simplified_generated == simplified_reference
            return True
        else:
            # pass to LLM to judge equality
            result = judge_equality_llm(simplified_generated, simplified_reference)
            if result == "Yes":
                print(f"Generated: {simplified_generated}, Reference: {simplified_reference}")
                return True
            else: 
                return False
    
    except (sp.SympifyError, TypeError, AttributeError):
        # If parsing or comparison fails, return False
        return False

In [13]:
%pip uninstall -y antlr4-python3-runtime
%pip install antlr4-python3-runtime==4.11 # for parse_latex of sympy

Found existing installation: antlr4-python3-runtime 4.13.2
Uninstalling antlr4-python3-runtime-4.13.2:
  Successfully uninstalled antlr4-python3-runtime-4.13.2
Note: you may need to restart the kernel to use updated packages.
Collecting antlr4-python3-runtime==4.11
  Downloading antlr4_python3_runtime-4.11.0-py3-none-any.whl.metadata (291 bytes)
Downloading antlr4_python3_runtime-4.11.0-py3-none-any.whl (144 kB)
Installing collected packages: antlr4-python3-runtime
Successfully installed antlr4-python3-runtime-4.11.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
import sympy
import antlr4
import pkg_resources

print(f"Sympy version: {sympy.__version__}")
print(f"ANTLR4 version: {pkg_resources.get_distribution('antlr4-python3-runtime').version}")

Sympy version: 1.13.3
ANTLR4 version: 4.11.0


In [36]:
import sympy as sp
from sympy.parsing.latex import parse_latex
import antlr4

def preprocess_latex_expression(expression):
    if expression is None:
        return None
    
    # # Handle interval notation specifically
    # expression = expression.replace(r'(', r'\left(').replace(r')', r'\right)')
    # expression = expression.replace(r'[', r'\left[').replace(r']', r'\right]')
    # expression = expression.replace(r'\in', 'in')
    # # Handle half-open and half-closed intervals
    # expression = expression.replace(r'\left(', r'\left\langle').replace(r'\right)', r'\right\rangle')
    # expression = expression.replace(r'\left[', r'\left\lbrack').replace(r'\right]', r'\right\rbrack')

    # Handle pmatrix: Convert to valid sympy matrix format
    def convert_pmatrix(match):
        matrix_content = match.group(1)
        # Split rows by '\\' and split elements by '&'
        rows = ['[' + ','.join(row.split('&')) + ']' for row in matrix_content.split(r'\\')]
        # Join rows to form a valid matrix structure
        return f'Matrix([{", ".join(rows)}])'

    # Find pmatrix and replace it with Matrix([...])
    expression = re.sub(r'\\begin\{pmatrix\}(.+?)\\end\{pmatrix\}', convert_pmatrix, expression, flags=re.DOTALL)


    # Automatically handle backslash-escaped symbols (like \%, \$, etc.)
    expression = re.sub(r'\\(.)', r'\1', expression)

    return expression


def enhance_exact_match(generated_answer, reference_answer):
    try:
        # Handle None values
        if generated_answer is None or reference_answer is None:
            return False
        
        # Preprocess LaTeX expressions
        generated_answer = preprocess_latex_expression(generated_answer)
        reference_answer = preprocess_latex_expression(reference_answer)

        # Handle LaTeX parsing
        generated_expr = parse_latex(generated_answer)
        reference_expr = parse_latex(reference_answer)
        
        # Simplify both expressions
        simplified_generated = sp.simplify(generated_expr)
        simplified_reference = sp.simplify(reference_expr)
        
        # Compare the simplified expressions
        if simplified_generated == simplified_reference:
            return True
        else:
            return False
            # Pass to LLM to judge equality
            # result = judge_equality_llm(str(simplified_generated), str(simplified_reference))
            # return True if result == "Yes" else False
    
    except (sp.SympifyError, TypeError, AttributeError, Exception) as e:
        # Print detailed error message
        print(f"Error parsing LaTeX expression: {e}")
        print(f"Generated answer: {generated_answer}")
        print(f"Reference answer: {reference_answer}")
        return False


In [15]:
from sympy.parsing.sympy_parser import parse_expr


In [12]:
correct = 0
for index in range(len(math_dataset_test)):
    generated_answer = results[f'{index}']["generated_answer"]
    reference_answer = results[f'{index}']["solution_answer"]
    
    correct_normal = True if generated_answer == reference_answer else False
    correct_sympy = enhance_exact_match(generated_answer, reference_answer)

    # if not correct_normal == correct_sympy:
    #     print(f"Index: {index}, Generated: {generated_answer}, Reference: {reference_answer}")
    #     print(f"Correct Normal: {correct_normal}, Correct Sympy: {correct_sympy}")

    if correct_sympy or correct_normal:
        correct += 1

print(f"Number of correct answers: {correct}")
print(f"Accuracy: {correct / len(math_dataset_test)}")

Error parsing LaTeX expression: I don't understand this
\$40
^
Generated answer: 20
Reference answer: \$40
Error parsing LaTeX expression: I don't understand this
.5
^
Generated answer: \frac{1}{2}
Reference answer: .5
Error parsing LaTeX expression: I expected one of these: '{' '['
\frac{\sqrt7}{14}
~~~~~~~~~~~^
Generated answer: \frac{\sqrt7}{14}
Reference answer: \frac{\sqrt7}{14}
Error parsing LaTeX expression: I expected one of these: '}'
\frac{\sqrtInterval.Ropen(3){9}}{3}
~~~~~~~~~~~~~~~~~~~^
Generated answer: \frac{\sqrtInterval.Ropen(3){9}}{3}
Reference answer: \frac{\sqrt{3}}{3}
Error parsing LaTeX expression: I don't understand this
\$15,\!000
^
Generated answer: 15000
Reference answer: \$15,\!000
Error parsing LaTeX expression: I don't understand this
\$ 139
^
Generated answer: 126
Reference answer: \$ 139
Error parsing LaTeX expression: I expected one of these: '{' '['
\frac{\sqrt6}3
~~~~~~~~~~~^
Generated answer: 1
Reference answer: \frac{\sqrt6}3
Error parsing LaTeX expr

## Eval Data from Meta

In [38]:
from huggingface_hub import login

hf_token = "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"
login(hf_token)
eval_dataset_FROM_META = load_dataset("meta-llama/Llama-3.2-1B-Instruct-evals", "Llama-3.2-1B-Instruct-evals__math__details")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /data/students/ryan/.cache/huggingface/token
Login successful


In [39]:
eval_dataset_FROM_META_df = eval_dataset_FROM_META['latest'].to_pandas()
eval_dataset_FROM_META_df.head()

Unnamed: 0,task_type,task_name,subtask_name,input_question,input_choice_list,input_final_prompts,input_correct_responses,output_prediction_text,output_parsed_answer,output_choice_completions,output_choice_negative_log_likelihoods,output_metrics,is_correct,input_question_hash,input_final_prompts_hash,benchmark_label,eval_config
0,Generative,math_chat_new,,Find the imaginary part of \[(\cos12^\circ+i\s...,,[<|start_header_id|>user<|end_header_id|>\n\nS...,[0],[## Step 1: Express the given expression in po...,0,,,"{'correct_format': 1.0, 'em': 1.0, 'em_maj1@1'...",True,5a2e4b5ae809e4fe13e8e25fedadc00309a38c663a0872...,[adcde01feac66b5a5a928be8804031ffddcb481ef73e4...,"MATH (4-shot, CoT)","{'max_gen_len': '5120', 'max_prompt_len': '307..."
1,Generative,math_chat_new,,"What is the least, positive four-digit multipl...",,[<|start_header_id|>user<|end_header_id|>\n\nS...,[1001],[## Step 1: Determine the smallest four-digit ...,1007,,,"{'correct_format': 1.0, 'em': 0.0, 'em_maj1@1'...",False,05be5a24c13a7f03934ca04c01bd595e757234538b3118...,[97745be9a6acd60858e258a46fbd4946e2904dd320c6c...,"MATH (4-shot, CoT)","{'max_gen_len': '5120', 'max_prompt_len': '307..."
2,Generative,math_chat_new,,What is $(1 + 2 \cdot 3 \cdot 4 \cdot 5) \div ...,,[<|start_header_id|>user<|end_header_id|>\n\nS...,[11],[## Step 1: Calculate the product inside the p...,21.909...,,,"{'correct_format': 1.0, 'em': 0.0, 'em_maj1@1'...",False,fc91c2faca18e68429a3ad12d1edcaf5eaa84183506aaf...,[2a840b3673983db48b20ed473579b710ad1f111764425...,"MATH (4-shot, CoT)","{'max_gen_len': '5120', 'max_prompt_len': '307..."
3,Generative,math_chat_new,,"Two ropes, 18 meters in length and 24 meters i...",,[<|start_header_id|>user<|end_header_id|>\n\nS...,[6],[## Step 1: Identify the key constraint\nThe p...,1,,,"{'correct_format': 1.0, 'em': 0.0, 'em_maj1@1'...",False,9e87e5f82a292976ff7a0503c803bb2fcbb026b1954d59...,[c7952d5c97db4006ebe8fba59bf41c9caa6dc35c59590...,"MATH (4-shot, CoT)","{'max_gen_len': '5120', 'max_prompt_len': '307..."
4,Generative,math_chat_new,,"In the diagram below, lines $k$ and $\ell$ are...",,[<|start_header_id|>user<|end_header_id|>\n\nS...,[60],[## Step 1: The problem provides a diagram wi...,30,,,"{'correct_format': 1.0, 'em': 0.0, 'em_maj1@1'...",False,30625ff210cf5a08933c65b91f4ff6b36fa1d03087ea1a...,[857c3d7d39e2cef68320fb00d5271b66676ba06244744...,"MATH (4-shot, CoT)","{'max_gen_len': '5120', 'max_prompt_len': '307..."


In [53]:
meta_results_df = eval_dataset_FROM_META_df.loc[:,["input_question","input_correct_responses","output_parsed_answer"]]
meta_results_df.head()

Unnamed: 0,input_question,input_correct_responses,output_parsed_answer
0,Find the imaginary part of \[(\cos12^\circ+i\s...,[0],0
1,"What is the least, positive four-digit multipl...",[1001],1007
2,What is $(1 + 2 \cdot 3 \cdot 4 \cdot 5) \div ...,[11],21.909...
3,"Two ropes, 18 meters in length and 24 meters i...",[6],1
4,"In the diagram below, lines $k$ and $\ell$ are...",[60],30


In [54]:
import pandas as pd

local_results_df = pd.DataFrame.from_dict(results, orient='index').loc[:,["problem","solution_answer","generated_answer"]]
local_results_df.head()

Unnamed: 0,problem,solution_answer,generated_answer
0,How many vertical asymptotes does the graph of...,2,1
1,What is the positive difference between $120\%...,10,1800
2,Find $x$ such that $\lceil x \rceil + x = \dfr...,\dfrac{9}{7},\dfrac{16}{7}
3,Evaluate $i^5+i^{-25}+i^{45}$.,i,0
4,"If $2^8=4^x$, what is the value of $x$?",4,4


In [55]:
merged_df = pd.merge(local_results_df, meta_results_df, left_on='problem', right_on='input_question')
merged_df.head()

Unnamed: 0,problem,solution_answer,generated_answer,input_question,input_correct_responses,output_parsed_answer
0,How many vertical asymptotes does the graph of...,2,1,How many vertical asymptotes does the graph of...,[2],2
1,What is the positive difference between $120\%...,10,1800,What is the positive difference between $120\%...,[10],10
2,Find $x$ such that $\lceil x \rceil + x = \dfr...,\dfrac{9}{7},\dfrac{16}{7},Find $x$ such that $\lceil x \rceil + x = \dfr...,[1.29e+00],1-n
3,Evaluate $i^5+i^{-25}+i^{45}$.,i,0,Evaluate $i^5+i^{-25}+i^{45}$.,[i],1
4,"If $2^8=4^x$, what is the value of $x$?",4,4,"If $2^8=4^x$, what is the value of $x$?",[4],2


In [56]:
merged_df

Unnamed: 0,problem,solution_answer,generated_answer,input_question,input_correct_responses,output_parsed_answer
0,How many vertical asymptotes does the graph of...,2,1,How many vertical asymptotes does the graph of...,[2],2
1,What is the positive difference between $120\%...,10,1800,What is the positive difference between $120\%...,[10],10
2,Find $x$ such that $\lceil x \rceil + x = \dfr...,\dfrac{9}{7},\dfrac{16}{7},Find $x$ such that $\lceil x \rceil + x = \dfr...,[1.29e+00],1-n
3,Evaluate $i^5+i^{-25}+i^{45}$.,i,0,Evaluate $i^5+i^{-25}+i^{45}$.,[i],1
4,"If $2^8=4^x$, what is the value of $x$?",4,4,"If $2^8=4^x$, what is the value of $x$?",[4],2
...,...,...,...,...,...,...
4995,If $\sin x + \cos x = \frac{1}{5}$ and $0 < x ...,-\frac{4}{3},1,If $\sin x + \cos x = \frac{1}{5}$ and $0 < x ...,[-1.33e+00],-6.24e-01
4996,The matrix for projecting onto a certain plane...,\begin{pmatrix} 1 \\ 2 \\ -3 \end{pmatrix},,The matrix for projecting onto a certain plane...,[\begin{pmatrix}1\2\-3\end{pmatrix}],\begin{pmatrix}13/7\-1/7\3/14\end{pmatrix}
4997,"Let $\mathbf{a},$ $\mathbf{b},$ and $\mathbf{c...",-\frac{155}{2},,"Let $\mathbf{a},$ $\mathbf{b},$ and $\mathbf{c...",[-7.75e+01],-7.75e+01
4998,Find the smallest positive integer solution to...,159,3\frac{24}{76}=\frac{9}{19},Find the smallest positive integer solution to...,[159],1


In [None]:
merged_df

In [59]:
math_dataset_test[4997]['solution']

'Since $\\mathbf{a} + \\mathbf{b} + \\mathbf{c} = \\mathbf{0},$\n\\[(\\mathbf{a} + \\mathbf{b} + \\mathbf{c}) \\cdot (\\mathbf{a} + \\mathbf{b} + \\mathbf{c}) = 0.\\]This expands as\n\\[\\mathbf{a} \\cdot \\mathbf{a} + \\mathbf{b} \\cdot \\mathbf{b} + \\mathbf{c} \\cdot \\mathbf{c} + 2 \\mathbf{a} \\cdot \\mathbf{b} + 2 \\mathbf{a} \\cdot \\mathbf{c} + 2 \\mathbf{b} \\cdot \\mathbf{c} = 0.\\]Since $\\mathbf{a} \\cdot \\mathbf{a} = \\|\\mathbf{a}\\|^2 = 25,$ $\\mathbf{b} \\cdot \\mathbf{b} = \\|\\mathbf{b}\\|^2 = 49,$ and $\\mathbf{c} \\cdot \\mathbf{c} = \\|\\mathbf{c}\\|^2 = 81,$\n\\[2(\\mathbf{a} \\cdot \\mathbf{b} + \\mathbf{a} \\cdot \\mathbf{c} + \\mathbf{b} \\cdot \\mathbf{c}) + 155 = 0.\\]Hence, $\\mathbf{a} \\cdot \\mathbf{b} + \\mathbf{a} \\cdot \\mathbf{c} + \\mathbf{b} \\cdot \\mathbf{c} = \\boxed{-\\frac{155}{2}}.$'

## Final Eval

In [67]:
EQUALITY_TEMPLATE = r"""
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications

Examples:

    Expression 1: $2x+3$
    Expression 2: $3+2x$

Yes

    Expression 1: 3/2
    Expression 2: 1.5

Yes

    Expression 1: $x^2+2x+1$
    Expression 2: $y^2+2y+1$

No

    Expression 1: $x^2+2x+1$
    Expression 2: $(x+1)^2$

Yes

    Expression 1: 3245/5
    Expression 2: 649

No
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)

    Expression 1: 2/(-3)
    Expression 2: -2/3

Yes
(trivial simplifications are allowed)

    Expression 1: 72 degrees
    Expression 2: 72

Yes
(give benefit of the doubt to units)

    Expression 1: 64
    Expression 2: 64 square feet

Yes
(give benefit of the doubt to units)

---

YOUR TASK


Respond with only "Yes" or "No" (without quotes). Do not include a rationale.

    Expression 1: %(expression1)s
    Expression 2: %(expression2)s
""".strip()


In [65]:
import torch
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast # load the model and tokenizer

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
# use a specific GPU
os.environ["CUDA_VISIBLE_DEVICES"]="4"

# Paths to model and tokenizer
model_dir = "../../../../../llm/llama/Llama-3.2-1B-Instruct"


# Use GPU for inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print the device being used
print(f"Using device: {device}")

# Load tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

# Set the eos_token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Load model
model = LlamaForCausalLM.from_pretrained(model_dir, 
                                        #  trust_remote_code=True, 
                                         low_cpu_mem_usage=True, 
                                         torch_dtype='auto'
                                         )

# Move the model to the GPU
model.to(device)

# Set the model to evaluation mode
model.eval()

Using device: cuda


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [70]:
import sympy as sp

# Function to judge equality using the template
def judge_equality_llm(expression1, expression2):
    prompt = EQUALITY_TEMPLATE % {"expression1": expression1, "expression2": expression2}
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=10)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return response

def check_match(generated_answer, reference_answer):
    try:
        # Return False if either value is NoneType
        if generated_answer is None or reference_answer is None:
            return False

        # Direct comparison
        if generated_answer == reference_answer:
            return True
        
        # Convert fractions to numeric format
        generated_expr = sp.sympify(generated_answer)
        reference_expr = sp.sympify(reference_answer)
        
        # Simplify both expressions
        simplified_generated = sp.simplify(generated_expr)
        simplified_reference = sp.simplify(reference_expr)
        
        # Compare the simplified expressions
        if simplified_generated == simplified_reference:
            return True
        else:
            # Pass to LLM to judge equality
            result = judge_equality_llm(simplified_generated, simplified_reference)
            return result == "Yes"
    
    except (sp.SympifyError, TypeError, AttributeError):
        # If parsing or comparison fails, we pass to LLM to judge equality
        result = judge_equality_llm(generated_answer, reference_answer)
        return result == "Yes"

In [72]:
import logging

# Set the logging level to ERROR to suppress warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

correct = 0
for index in range(len(math_dataset_test)):
    generated_answer = results[f'{index}']["generated_answer"]
    reference_answer = results[f'{index}']["solution_answer"]
    
    is_correct = check_match(generated_answer, reference_answer)
    correct += is_correct

print(f"Number of correct answers: {correct}")
print(f"Accuracy: {correct / len(math_dataset_test)}")

Number of correct answers: 1052
Accuracy: 0.2104
