# Llama 3.1 70B Multiple Choice, no choices

In [1]:
import sys
from pathlib import Path

BASEDIR = Path("/workspaces/HARP/") / "src"  # Replace with your own basedir path for the repo

# Make the repo's `src` directory importable. Any existing copy of the entry is dropped
# first, so re-running this cell keeps it at the front of `sys.path` exactly once
# instead of piling up duplicates.
_src_dir = str(BASEDIR)
sys.path[:] = [entry for entry in sys.path if entry != _src_dir]
sys.path.insert(0, _src_dir)

In [2]:
from __future__ import annotations

import copy
import itertools
import json
import math
import os
import pickle
import pprint
import re
import textwrap
import time
import traceback
from collections import Counter, defaultdict
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tiktoken
from IPython.display import Markdown, clear_output, display
from tqdm.auto import tqdm

In [3]:
from eval.costs import count_tokens, get_pricing
from eval.eval import run_one, create_batch, make_answer_check_dict_from_jsonl, make_results_df, accuracy_by_split
from eval.parsing_lib import *
from eval.latex_answer_check import *
from eval.response import ModelResponse
from eval.utils import read_jsonl, write_jsonl, get_uid

# Data

In [4]:
# HARP_mcq.jsonl is deliberately not used here: this run relied on the original
# ordering of the answer choices, so the raw file is filtered instead.
dataset = []
for problem in read_jsonl(BASEDIR / "data/processed/HARP_raw.jsonl"):
    # Drop problems that have no answer choices, and all calculus problems.
    if problem["choices"] is None or problem["subject"] == "calculus":
        continue
    dataset.append(problem)

# Lookup from problem uid to its dataset record.
dataset_map = dict((get_uid(problem), problem) for problem in dataset)
len(dataset)

4110

# Results

```
python run_eval.py --model meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo --api together --out hard.jsonl --temperature 0 --max-tokens 2048 --do-multiple-choice
```

In [5]:
fname = BASEDIR / "outputs/mcq/Meta-Llama-3.1-70B-Instruct-Turbo/no-choices.jsonl"

# Keep only the saved records whose uid is part of the filtered dataset, and convert
# each stored response through ModelResponse.from_response (tagged "together").
responses = []
for record in read_jsonl(fname):
    if record["uid"] not in dataset_map:
        continue
    responses.append(
        {
            "uid": record["uid"],
            "system": record["system"],
            "prompt": record["prompt"],
            "response": ModelResponse.from_response(record["response"], "together"),
        }
    )

# Lookup from uid to its parsed response entry.
response_map = {}
for entry in responses:
    response_map[entry["uid"]] = entry

answer_check_dicts = make_answer_check_dict_from_jsonl(responses, dataset_map)

In [6]:
# UIDs of finished responses from which no answer choice could be extracted.
llama_wanted_choices = []
# One merged dict per problem: the answer-check input plus the checker's verdict.
answer_check_results = []

for record in tqdm(answer_check_dicts):
    # Only generations that ended with finish_reason "stop" are graded on their text;
    # anything else is passed to the checker as None.
    if record["finish_reason"] == "stop":
        generated = record["generated_text"]
    else:
        generated = None
    expected_letter = record["answer_choice"]
    verdict = check_one_latex_answer(
        generated,
        expected_letter,
        extract_policy="flex",
        eval_policy="aggressive",
    )

    # Sanity check: re-extract the chosen letter and compare it to the ground truth by
    # exact match. Unfinished generations get the placeholder "F"
    # (NOTE(review): presumably because it is outside the A-E choice letters - confirm).
    if generated is None:
        picked = "F"
    else:
        picked = extract_answer(generated, EXTRACT_RE_PATTERNS)

    if picked is None:
        llama_wanted_choices.append(get_uid(record))
    else:
        picked = remove_boxes_keep_content(clean_answer(picked))
        expected = remove_boxes_keep_content(clean_answer(expected_letter))
        exact_match = picked == expected
        if verdict["is_correct"] != exact_match:
            print("DIDNT MATCH EXACT MATCH:", verdict)

    answer_check_results.append({**record, **verdict})

# Overall accuracy, measured against the full filtered dataset size.
sum(graded["is_correct"] for graded in answer_check_results) / len(dataset)

  0%|          | 0/4110 [00:00<?, ?it/s]

0.18953771289537713

In [7]:
# Build the per-problem results table from the parsed responses, the answer-check
# results and the dataset records, in "mcq" mode. Later cells write it out with
# `.to_csv` and summarize it with `.value_counts`.
results_metadata = make_results_df(responses, answer_check_results, dataset_map, mode="mcq")

In [8]:
# Write the results table to CSV, creating the model's results directory if needed.
results_csv = BASEDIR / "results/Meta-Llama-3.1-70B-Instruct-Turbo/results_mcq_nochoices.csv"
os.makedirs(results_csv.parent, exist_ok=True)
results_metadata.to_csv(results_csv)

In [24]:
# Reload the saved results CSV and index it by uid, so the analysis below can start
# from the file on disk.
results_metadata = pd.read_csv(BASEDIR / "results/Meta-Llama-3.1-70B-Instruct-Turbo/results_mcq_nochoices.csv").set_index("uid")

In [10]:
# Number of responses per value of the `reason` column ("stop" vs. "length").
results_metadata.value_counts("reason")

reason
stop      3558
length     552
Name: count, dtype: int64

In [12]:
# Fraction of the dataset whose response did not end with reason "stop".
stop_count = results_metadata.value_counts("reason").loc["stop"]
1 - stop_count / len(dataset)

np.float64(0.13430656934306573)

## Cost

In [13]:
PRICES = get_pricing("meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo")
input_price = PRICES["input_tokens"]
output_price = PRICES["output_tokens"]

# Accumulate the cost row by row: input-token cost first, then output-token cost.
total_cost = 0
token_pairs = zip(results_metadata["input_tokens"], results_metadata["output_tokens"])
for n_input, n_output in token_pairs:
    total_cost += input_price * n_input
    total_cost += output_price * n_output
total_cost

3.0315234399999738

## Accuracy

In [14]:
# Problem count and accuracy for each value of `level`.
accuracy_by_split(results_metadata, "level")

Unnamed: 0_level_0,count,accuracy
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,858,19.58042
2,1612,22.084367
3,1136,15.140845
4,504,16.468254


In [15]:
# Problem count and accuracy for each value of `subject`.
accuracy_by_split(results_metadata, "subject")

Unnamed: 0_level_0,count,accuracy
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
algebra,872,18.119266
counting_and_probability,600,19.666667
geometry,1071,17.180205
number_theory,473,15.856237
prealgebra,957,22.884013
precalculus,137,18.248175


**Compare overlap with Short Ans prompt**

In [16]:
# Mean of `is_correct`, as a percentage, over the rows where `mcq_only` is False.
# NOTE(review): `mcq_only` presumably marks problems excluded from the short-answer
# run - confirm against make_results_df.
results_metadata[~results_metadata["mcq_only"]]["is_correct"].mean() * 100

np.float64(19.515406900184356)

In [17]:
# Per-`level` count and accuracy, restricted to the rows where `mcq_only` is False.
accuracy_by_split(results_metadata[~results_metadata["mcq_only"]], "level")

Unnamed: 0_level_0,count,accuracy
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,799,19.524406
2,1505,22.657807
3,1036,15.830116
4,457,17.50547


## Asking for choices

One thing I noticed is that Llama will sometimes recognize that a math problem is incomplete without its answer choices, and decline to make up a solution and answer! However, its judgment is not always right, e.g. 1951/AHSME/1, which is not marked as multiple-choice-only.

In [18]:
# First uid for which no answer choice could be extracted from a finished response.
llama_wanted_choices[0]

'1951/AHSME/1'

In [20]:
# How many finished responses yielded no extractable answer choice.
len(llama_wanted_choices)

24

In [19]:
# Show the dataset records, among those with no extracted choice, that are NOT flagged
# as multiple-choice-only.
not_mcq_only = (
    dataset_map[uid]
    for uid in llama_wanted_choices
    if not dataset_map[uid]["multiple_choice_only"]
)
for problem in not_mcq_only:
    print(problem)

{'year': '1951', 'contest': 'AHSME', 'number': 1, 'url': 'https://artofproblemsolving.com/wiki/index.php/1951_AHSME_Problems/Problem_1', 'level': 2, 'subject': 'prealgebra', 'multiple_choice_only': False, 'full_text': '# Problem\nThe percent that $M$ is greater than $N$ is:\n$(\\mathrm{A})\\ \\frac{100(M-N)}{M} \\qquad (\\mathrm{B})\\ \\frac{100(M-N)}{N} \\qquad (\\mathrm{C})\\ \\frac{M-N}{N} \\qquad (\\mathrm{D})\\ \\frac{M-N}{M} \\qquad (\\mathrm{E})\\ \\frac{100(M+N)}{N}$\n\n# Solution\n$M-N$ is the amount by which $M$ is greater than $N$. We divide this by $N$ to get the percent by which $N$ is increased in the form of a decimal, and then multiply by $100$ to make it a percentage. Therefore, the answer is $\\boxed{\\mathrm{(B)}\\ \\dfrac{100(M-N)}{N}}$.\n', 'num_gpt4_tokens': 192, 'choices': {'A': '$\\frac{100(M-N)}{M}$', 'B': '$\\frac{100(M-N)}{N}$', 'C': '$\\frac{M-N}{N}$', 'D': '$\\frac{M-N}{M}$', 'E': '$\\frac{100(M+N)}{N}$'}, 'problem': 'The percent that $M$ is greater than $N

## Look at responses

It seems the model does not have the answer choices memorized: it often derives the correct answer value but then picks the wrong answer letter. Overall, its accuracy (about 19%) is also no better than random guessing among five choices (20%).

In [21]:
# Per-uid lookup joining each graded result with its problem metadata. Keys supplied by
# the graded result override the metadata keys listed before it.
# To inspect only wrong answers that finished normally, skip entries unless
# `not graded["is_correct"]` and the response's finish_reason is "stop".
answer_check_results_map = {}
for check_input, graded in zip(answer_check_dicts, answer_check_results):
    uid = check_input["uid"]
    problem_meta = dataset_map[uid]
    answer_check_results_map[uid] = {
        "uid": uid,
        "url": problem_meta["url"],
        "level": problem_meta["level"],
        "subject": problem_meta["subject"],
        "problem": check_input["problem"],
        **graded,
    }

In [25]:
# Pick a random problem whose response finished with reason "stop", then show its
# ground truth, the final prompt message and the rendered model completion.
finished_uids = results_metadata.query("reason == 'stop'").index
rand_uid = np.random.choice(finished_uids)
print(rand_uid)

sampled_problem = dataset_map[rand_uid]
for field in ("answer_choice", "answer", "choices"):
    print(sampled_problem[field])

sampled_response = response_map[rand_uid]
print(sampled_response["prompt"][-1]["content"])
display(Markdown(sampled_response["response"].completions[0].completion))

1965/AHSME/23
D
$.0401$
{'A': '$.0301$', 'B': '$.0349$', 'C': '$.0399$', 'D': '$.0401$', 'E': '$.0499$'}
Solve the following math problem efficiently and clearly:

- For simple problems (2 steps or fewer):
Provide a concise solution with minimal explanation.

- For complex problems (3 steps or more):
Use this step-by-step format:

## Step 1: [Concise description]
[Brief explanation and calculations]

## Step 2: [Concise description]
[Brief explanation and calculations]

...

Regardless of the approach, always conclude with:

Therefore, the final answer is: ?. I hope it is correct.

Where ? is replaced by one of the letters A, B, C, D or E.

Problem: If we write $|x^2 - 4| < N$ for all $x$ such that $|x - 2| < 0.01$, the smallest value we can use for $N$ is:



## Step 1: Understand the given inequality
We are given the inequality $|x^2 - 4| < N$ for all $x$ such that $|x - 2| < 0.01$. This means we need to find the smallest possible value of $N$ that satisfies this condition.

## Step 2: Analyze the range of x values
Since $|x - 2| < 0.01$, we can rewrite this as $1.99 < x < 2.01$. This gives us the range of $x$ values that we need to consider.

## Step 3: Find the maximum value of $|x^2 - 4|$ in the given range
To find the smallest possible value of $N$, we need to find the maximum value of $|x^2 - 4|$ in the given range of $x$ values. Since $x^2 - 4$ is a quadratic function, its maximum value in the given range will occur at one of the endpoints.

## Step 4: Calculate the maximum value of $|x^2 - 4|$
Let's calculate the value of $|x^2 - 4|$ at the endpoints of the range:
- At $x = 1.99$, $|x^2 - 4| = |(1.99)^2 - 4| = |3.9601 - 4| = 0.0399$
- At $x = 2.01$, $|x^2 - 4| = |(2.01)^2 - 4| = |4.0401 - 4| = 0.0401$

## Step 5: Determine the smallest possible value of N
The maximum value of $|x^2 - 4|$ in the given range is 0.0401. Therefore, the smallest possible value of $N$ is 0.0401.

Therefore, the final answer is: A. 