In [1]:
import os
import sys
from pathlib import Path

# Location of the repo's `src` directory. The default is the original hard-coded path;
# set the HARP_BASEDIR environment variable to the repo root to use a different checkout
# without editing this cell.
BASEDIR = Path(os.environ.get("HARP_BASEDIR", "/workspaces/HARP/")) / "src"

# Put `src` at the front of the import path so the project imports in the following
# cells are looked up there first.
sys.path.insert(0, str(BASEDIR))

In [2]:
from __future__ import annotations

import copy
import itertools
import json
import math
import os
import pickle
import pprint
import re
import textwrap
import time
import traceback
from collections import Counter, defaultdict
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tiktoken
from IPython.display import Markdown, clear_output, display
from tqdm.auto import tqdm

import vertexai
from vertexai.batch_prediction._batch_prediction import BatchPredictionJob

In [3]:
from eval.costs import count_tokens, get_pricing
from eval.eval import run_one, create_batch, make_answer_check_dict_from_jsonl, make_results_df, accuracy_by_split
from eval.parsing_lib import *
from eval.latex_answer_check import *
from eval.response import ModelResponse
from eval.utils import read_jsonl, write_jsonl, get_uid

# Data

In [4]:
# Load the processed problems and index them by uid for lookups in later cells.
dataset = read_jsonl(BASEDIR / "data/processed/HARP.jsonl")
dataset_map = dict((get_uid(problem), problem) for problem in dataset)
len(dataset)

4780

# Results

What I ran for Flash
```
# minerva prompt
python run_eval.py --model gemini-1.5-flash-002 --api google --out outputs_minerva.jsonl --temperature 0 --max-tokens 2048 --use-minerva

# zeroshot prompt
python run_eval.py --model gemini-1.5-flash-002 --api google --out outputs.jsonl --temperature 0 --max-tokens 2048
```

In [5]:
# Outputs of the minerva-prompt run.
fname = BASEDIR / "outputs/short_answer/gemini-1.5-flash-002/outputs_minerva.jsonl"
raw_responses = read_jsonl(fname)

# Raw rows keyed by uid; rows whose uid is not in the dataset are dropped.
raw_response_map = {row["uid"]: row for row in raw_responses if row["uid"] in dataset_map}

# Same filter, with the provider payload parsed into a ModelResponse.
responses = [
    {
        "uid": row["uid"],
        "system": row["system"],
        "prompt": row["prompt"],
        "response": ModelResponse.from_response(row["response"], "google"),
    }
    for row in raw_responses
    if row["uid"] in dataset_map
]
response_map = {entry["uid"]: entry for entry in responses}

# Grade every response against the reference answers and build the per-problem results table.
answer_check_dicts = make_answer_check_dict_from_jsonl(responses, dataset_map)
answer_check_results = latex_answer_check(answer_check_dicts, use_tqdm=True)
results_metadata = make_results_df(responses, answer_check_results, dataset_map, mode="shortans")

# Accuracy in percent; the denominator is the full dataset size, not the number of graded rows.
n_correct = sum(check["is_correct"] for check in answer_check_results)
n_correct / len(dataset) * 100

  0%|          | 0/4780 [00:00<?, ?it/s]

Function timed out after 10 seconds
('Let the given expression be denoted by $f(x)$. For $f(x)$ to be defined, we must have\n\\begin{align*} \\label{eq:1}\\log_{2001} x &> 0 \\\\ \\log_{2002} (\\log_{2001} x) &> 0 \\\\ \\log_{2003} (\\log_{2002} (\\log_{2001} x)) &> 0 \\\\ \\log_{2004} (\\log_{2003} (\\log_{2002} (\\log_{2001} x))) &> 0 \\end{align*} \nThe first inequality gives $x > 2001^0 = 1$.\nThe second inequality gives $\\log_{2001} x > 2002^0 = 1$, so $x > 2001^1 = 2001$.\nThe third inequality gives $\\log_{2002} (\\log_{2001} x) > 2003^0 = 1$, so $\\log_{2001} x > 2002^1 = 2002$, which means $x > 2001^{2002}$.\nThe fourth inequality gives $\\log_{2003} (\\log_{2002} (\\log_{2001} x)) > 2004^0 = 1$, so $\\log_{2002} (\\log_{2001} x) > 2003^1 = 2003$, which means $\\log_{2001} x > 2002^{2003}$, so $x > 2001^{2002^{2003}}$.\n\nTherefore, the set of all real numbers $x$ for which the expression is defined is $\\{x \\mid x > 2001^{2002^{2003}}\\}$.\nThus $c = 2001^{2002^{2003}}$.\n\

51.19246861924687

In [6]:
# Outputs of the zeroshot-prompt run.
fname = BASEDIR / "outputs/short_answer/gemini-1.5-flash-002/outputs.jsonl"

raw_responses_zeroshot = read_jsonl(fname)
responses_zeroshot = [
    {
        "uid": x["uid"],
        "system": x["system"],
        "prompt": x["prompt"],
        "response": ModelResponse.from_response(x["response"], "google")
    }
    for x in raw_responses_zeroshot
    if x["uid"] in dataset_map
]
# Both lookup maps must be built from the *zeroshot* data. Filtering the raw rows by uid
# (rather than zipping raw rows with the filtered list) keeps each uid paired with its own row.
raw_response_zeroshot_map = {
    x["uid"]: x for x in raw_responses_zeroshot if x["uid"] in dataset_map
}
response_zeroshot_map = {o["uid"]: o for o in responses_zeroshot}

# Grade every response against the reference answers and build the per-problem results table.
answer_check_dicts_zeroshot = make_answer_check_dict_from_jsonl(responses_zeroshot, dataset_map)
answer_check_results_zeroshot = latex_answer_check(answer_check_dicts_zeroshot, use_tqdm=True)
zeroshot_results_metadata = make_results_df(responses_zeroshot, answer_check_results_zeroshot, dataset_map, mode="shortans")

# Accuracy in percent; the denominator is the full dataset size, not the number of graded rows.
sum([x["is_correct"] for x in answer_check_results_zeroshot]) / len(dataset) * 100

  0%|          | 0/4780 [00:00<?, ?it/s]

52.25941422594143

In [8]:
# Make sure the output directory exists, then write one CSV per prompt variant.
out_dir = BASEDIR / "results/gemini-1.5-flash-002"
os.makedirs(out_dir, exist_ok=True)
for frame, target in (
    (results_metadata, BASEDIR / "results/gemini-1.5-flash-002/results_minerva.csv"),
    (zeroshot_results_metadata, BASEDIR / "results/gemini-1.5-flash-002/results.csv"),
):
    frame.to_csv(target)

## Cost

In [9]:
# Price table for the model, keyed by "input_tokens" / "output_tokens" (see the output below).
# NOTE(review): presumably a per-token price in USD — confirm against eval.costs.
PRICES = get_pricing("gemini-1.5-flash-002")
PRICES

{'input_tokens': 1.875e-08, 'output_tokens': 7.5e-08}

In [10]:
# Estimated cost of the zeroshot run: tokens are re-counted locally and priced with PRICES.
total_cost = 0
for ex in raw_responses_zeroshot:
    # Input side: every system part, then the content of every prompt message.
    input_len = sum(count_tokens(part, "gemini-1.5-flash-002") for part in ex["system"])
    input_len = sum(
        (count_tokens(msg["content"], "gemini-1.5-flash-002") for msg in ex["prompt"]),
        input_len,
    )
    total_cost += PRICES["input_tokens"] * input_len

    # Output side: text parts of every candidate that actually carries content
    # (candidates without a "content" key contribute nothing).
    output_len = sum(
        count_tokens(part["text"], "gemini-1.5-flash-002")
        for cand in ex["response"]["candidates"]
        if "content" in cand
        for part in cand["content"]["parts"]
    )
    total_cost += PRICES["output_tokens"] * output_len
total_cost / 2  # divide by 2 for batch api

0.2124494062500012

In [11]:
# Estimated cost of the minerva-prompt run: tokens are re-counted locally and priced with PRICES.
total_cost = 0
for ex in raw_responses:
    # Input side: every system part, then the content of every prompt message.
    input_len = sum(count_tokens(part, "gemini-1.5-flash-002") for part in ex["system"])
    input_len = sum(
        (count_tokens(msg["content"], "gemini-1.5-flash-002") for msg in ex["prompt"]),
        input_len,
    )
    total_cost += PRICES["input_tokens"] * input_len

    # Output side: text parts of every candidate that actually carries content
    # (candidates without a "content" key contribute nothing).
    output_len = sum(
        count_tokens(part["text"], "gemini-1.5-flash-002")
        for cand in ex["response"]["candidates"]
        if "content" in cand
        for part in cand["content"]["parts"]
    )
    total_cost += PRICES["output_tokens"] * output_len
total_cost / 2  # divide by 2 for batch api

0.24321309375000175

## Finish reason

- 3 problems were stopped for "copyright".
    - 1 is from "The Contest Problem Book IV". This seems to indicate that these books are in the Gemini training data, which means data contamination! TODO: see if the models do better at these problems? will be some confounders
    - 1 is from a generic sounding trig book. Seems reasonable because the problem seems like a basic-ish trig identity
    - 1 is from a blog post about a problem in the Haselbauer-Dickheiser test(?). I couldn't find anything about the AMC on that WordPress site, and the cited problem is not the same one. Seems like a false positive.
    - Notably, Gemini 1.5 Pro v2 is stopped for copyright on a disjoint set of problems, so this behavior is not deterministic and/or depends on model parameters/training.
- For the 7 max_length hits
    - 6 are repetition, sometimes even of a single digit in a float!
    - 1 is an inf counting issue: it starts listing all possible values

In [12]:
# Number of minerva-prompt responses per finish reason (the "reason" column).
results_metadata.value_counts("reason")

reason
stop         4770
length          7
copyright       3
Name: count, dtype: int64

In [13]:
# Responses that did not finish with "stop", counted per (level, reason) pair.
results_metadata.query("reason != 'stop'").value_counts(["level", "reason"]).sort_index()

level  reason   
2      copyright    1
       length       1
3      length       3
4      copyright    2
       length       3
Name: count, dtype: int64

In [14]:
# Responses that did not finish with "stop", counted per (subject, reason) pair.
results_metadata.query("reason != 'stop'").value_counts(["subject", "reason"]).sort_index()

subject        reason   
algebra        copyright    1
               length       1
number_theory  length       5
prealgebra     length       1
precalculus    copyright    2
Name: count, dtype: int64

In [15]:
# Full result rows for every response that did not finish with "stop".
results_metadata.query("reason != 'stop'")

Unnamed: 0_level_0,level,subject,mcq_only,has_asy_problem,has_asy_solution,reason,is_correct,answer,predict,input_tokens,output_tokens,reasoning_tokens
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1982/AHSME/27,4,precalculus,False,False,False,copyright,False,$-a+bi$,,800,0,0
1983/AHSME/11,2,precalculus,False,False,False,copyright,False,$\sin x$,,729,0,0
2006/AMC_8/24,2,prealgebra,False,False,False,length,False,$1$,,771,2048,0
2007/AMC_10B/24,3,number_theory,False,False,False,length,False,$4944$,,765,2048,0
2014/AMC_12A/23,4,number_theory,False,False,False,length,False,$883$,,784,2048,0
2015/AIME_I/8,4,number_theory,False,False,False,length,False,$695$,,753,2048,0
2016/AMC_12B/25,4,algebra,False,False,False,copyright,False,$17$,,794,0,0
2017/AMC_10B/25,3,algebra,False,False,False,length,False,$100$,,775,2048,0
2020/AIME_II/5,3,number_theory,False,False,False,length,False,$151$,,883,2048,0
2021/AMC_12A/25,4,number_theory,False,False,False,length,False,$9$,,826,2048,0


In [16]:
# Render the first completion for one of the length-limited problems as Markdown.
ex_uid = "2021/AMC_12A/25"
ex_completion = response_map[ex_uid]["response"].completions[0]
display(Markdown(ex_completion.completion))

Let $f(n) = \frac{d(n)}{n^{1/3}}$. We want to find the unique positive integer $N$ such that $f(N) > f(n)$ for all positive integers $n \ne N$.

We have $d(1) = 1$, $d(2) = 2$, $d(3) = 2$, $d(4) = 3$, $d(5) = 2$, $d(6) = 4$, $d(7) = 2$, $d(8) = 4$, $d(9) = 3$, $d(10) = 4$, $d(11) = 2$, $d(12) = 6$.

Then $f(1) = 1$, $f(2) \approx 1.587$, $f(3) \approx 1.260$, $f(4) \approx 1.189$, $f(5) \approx 0.894$, $f(6) \approx 1.633$, $f(7) \approx 0.756$, $f(8) \approx 1.189$, $f(9) \approx 1.054$, $f(10) \approx 1.260$, $f(11) \approx 0.715$, $f(12) \approx 1.732$.

It appears that $f(n)$ is maximized when $n$ is highly composite.  Let's consider highly composite numbers.  The highly composite numbers are 1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280, 720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480, 7207200, 8648640, 10810800, 12972960, 17297280, 21621600, 25945920, 34594560, 43243200, 51891840, 69189120, 86486400, 103783680, 138378240, 172972800, 207567360, 276756480, 345945600, 415134720, 553512960, 691891200, 830269440, 1107025920, 1383782400, 1660538880, 2214051840, 2767564800, 3321077760, 4428103680, 5535129600, 6642155520, 8856207360, 11070259200, 13284311040, 17712414720, 22140518400, 26568622080, 35424829440, 44281036800, 53137244160, 70849658880, 88562073600, 106274488320, 141699317760, 177124147200, 212548976640, 283398635520, 354248294400, 425097953280, 566797271040, 708496588800, 850195906560, 1133594542080, 1416993177600, 1700391813120, 2267189084160, 2833986355200, 3400783626240, 4534378168320, 5667972710400, 6801567252480, 9068756336640, 11335945420800, 13603134504960, 18137512673280, 22671890841600, 27206269009920, 36275025346560, 45343781683200, 54412538019840, 72550050693120, 90687563366400, 108825076039680, 145100101386240, 181375126732800, 217650152079360, 290200202772480, 362750253465600, 435300304158720, 580400405544960, 725500506931200, 870600608317440, 1160800811089920, 1451001013862400, 1741201216634880, 2321601622179840, 2902002027724800, 3482402433269760, 4643203244359680, 5804004055449600, 6964804866539520, 9286406488719360, 11608008110899200, 13929609733079040, 18572812977438720, 23216016221798400, 27859219466158080, 371456259548774

In [17]:
# Second-to-last prompt message sent for this problem (the output shows the user turn
# containing the problem statement).
raw_response_map["2016/AMC_12B/25"]["prompt"][-2]

{'role': 'user',
 'content': 'Problem:\nThe sequence $(a_n)$ is defined recursively by $a_0=1$, $a_1=\\sqrt[19]{2}$, and $a_n=a_{n-1}a_{n-2}^2$ for $n\\geq 2$. What is the smallest positive integer $k$ such that the product $a_1a_2\\cdots a_k$ is an integer?'}

In [18]:
# Raw API response for the same problem; finish_reason and citation_metadata in the output
# show why generation was stopped.
raw_response_map["2016/AMC_12B/25"]["response"]

{'candidates': [{'finish_reason': 'RECITATION',
   'citation_metadata': {'citations': [{'start_index': 746,
      'end_index': 951,
      'uri': 'https://dkalemis.wordpress.com/2018/11/10/my-attempt-at-question-2-from-the-haselbauer-dickheiser-test/'}]},
   'avg_logprobs': 'NaN'}],
 'usage_metadata': {'prompt_token_count': 794, 'total_token_count': 794},
 'model_version': 'gemini-1.5-flash-002',
 'model': 'gemini-1.5-flash-002'}

In [19]:
# Parsed ModelResponse of the minerva-prompt run for a single problem.
response_map["2001/AMC_12/16"]["response"]

ModelResponse(completions=[ModelCompletion(index=0, completion="For each leg, the spider must put on the sock before the shoe.  There are 8 legs. For each leg, there are 2 items to put on.  Thus, there are $2!$ ways to put on the sock and shoe for each leg. Since there are 8 legs, there are $(2!)^8$ ways to put on the socks and shoes.  However, we must consider the order in which the spider puts on the socks and shoes for each leg.  For each leg, there are 2 items to put on, so there are 2 choices for the first item and 1 choice for the second item.  This gives $2! = 2$ ways to put on the socks and shoes for each leg. Since there are 8 legs, there are $(2!)^8 = 2^8 = 256$ ways to put on the socks and shoes.\n\nThere are 16 items in total. If there were no restrictions, there would be 16! ways to put on the items. However, for each leg, the sock must be put on before the shoe.  Let's consider one leg. There are 2! ways to put on the sock and shoe for that leg. Since there are 8 legs, th

## Accuracy

In [20]:
# Minerva-prompt run: problem count and accuracy per level.
accuracy_by_split(results_metadata, "level")

Unnamed: 0_level_0,count,accuracy
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,799,72.715895
2,1505,64.451827
3,1363,46.661775
4,719,30.876217
5,197,13.19797
6,197,6.091371


In [21]:
# Minerva-prompt run: problem count and accuracy per subject.
accuracy_by_split(results_metadata, "subject")

Unnamed: 0_level_0,count,accuracy
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
algebra,970,60.309278
counting_and_probability,812,42.118227
geometry,1268,38.40694
number_theory,601,49.251248
prealgebra,889,73.790776
precalculus,240,33.75


In [22]:
# Minerva-prompt run: problem count and accuracy per (level, subject) combination.
accuracy_by_split(results_metadata, ["level", "subject"])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,accuracy
level,subject,Unnamed: 2_level_1,Unnamed: 3_level_1
1,algebra,33,72.727273
1,counting_and_probability,126,67.460317
1,geometry,148,60.135135
1,number_theory,84,77.380952
1,prealgebra,408,77.941176
2,algebra,309,77.993528
2,counting_and_probability,222,55.855856
2,geometry,398,50.502513
2,number_theory,172,64.534884
2,prealgebra,388,73.195876


In [23]:
# Zeroshot-prompt run: problem count and accuracy per level.
accuracy_by_split(zeroshot_results_metadata, "level")

Unnamed: 0_level_0,count,accuracy
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,799,71.964956
2,1505,65.448505
3,1363,48.422597
4,719,31.849791
5,197,15.228426
6,197,9.64467


In [24]:
# Zeroshot-prompt run: problem count and accuracy per subject.
accuracy_by_split(zeroshot_results_metadata, "subject")

Unnamed: 0_level_0,count,accuracy
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
algebra,970,60.0
counting_and_probability,812,45.073892
geometry,1268,39.66877
number_theory,601,51.414309
prealgebra,889,73.453318
precalculus,240,35.416667


## Look at some random outputs

In [26]:
# Collect incorrect answers whose generation ended with "stop", annotated with the
# problem's uid, level, subject and statement. The three lists are indexed in parallel.
wrong_answers = []
for check_idx, check in enumerate(answer_check_results):
    if check["is_correct"]:
        continue
    stopped_normally = responses[check_idx]["response"].completions[0].finish_reason == "stop"
    if not stopped_normally:
        continue
    wrong_answers.append(
        {
            "uid": answer_check_dicts[check_idx]["uid"],
            "level": dataset_map[answer_check_dicts[check_idx]["uid"]]["level"],
            "subject": dataset_map[answer_check_dicts[check_idx]["uid"]]["subject"],
            "problem": answer_check_dicts[check_idx]["problem"],
            **check,
        }
    )

In [27]:
# Number of incorrect answers whose generation finished with "stop".
len(wrong_answers)

2323

In [28]:
# Pick one wrong answer at random for manual inspection (unseeded, so each run picks anew).
rand_idx = np.random.randint(low=0, high=len(wrong_answers))
wrong_answers[rand_idx]

{'uid': '2003/AMC_12A/12',
 'level': 3,
 'subject': 'number_theory',
 'problem': 'Sally has five red cards numbered $1$ through $5$ and four blue cards numbered $3$ through $6$. She stacks the cards so that the colors alternate and so that the number on each red card divides evenly into the number on each neighboring blue card. What is the sum of the numbers on the middle three cards?',
 'finish_reason': <FinishReason.STOP: 'stop'>,
 'generated_text': "Let R denote a red card and B denote a blue card.  The cards must alternate in color, so the arrangement must be RBRBRB or BRBRBR.\n\nThe numbers on the red cards are 1, 2, 3, 4, 5. The numbers on the blue cards are 3, 4, 5, 6.\n\nIf the arrangement is RBRBRB, then the first red card must divide the first blue card.  The only possibility is that the first red card is 1.  Then the next blue card must be divisible by 2, so it must be 4 or 6.  If it is 4, then the next red card must be 1, 2, or 4.  If it is 6, then the next red card must be