# Gemini 1.5 Pro Multiple Choice

In [1]:
import os
import sys
from pathlib import Path

# Location of the repo's `src` directory. Set the HARP_BASEDIR environment variable to the
# root of your checkout to override it without editing this cell; when the variable is
# unset the original devcontainer path is used.
BASEDIR = Path(os.environ.get("HARP_BASEDIR", "/workspaces/HARP/")) / "src"

# Give BASEDIR top priority on the import path; the cells below import project code
# (the `eval` package) after this runs.
sys.path.insert(0, str(BASEDIR))

In [2]:
from __future__ import annotations

import copy
import itertools
import json
import math
import os
import pickle
import pprint
import re
import textwrap
import time
import traceback
from collections import Counter, defaultdict
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tiktoken
from IPython.display import Markdown, clear_output, display
from tqdm.auto import tqdm

In [3]:
from eval.api import safe_unified_api_call
from eval.costs import count_tokens, get_pricing
from eval.eval import run_one, create_batch, make_answer_check_dict_from_jsonl, make_results_df, accuracy_by_split
from eval.parsing_lib import *
from eval.latex_answer_check import *
from eval.prompt import create_prompt
from eval.prompts import *
from eval.response import ModelResponse
from eval.utils import read_jsonl, write_jsonl, get_uid

# Data

In [4]:
# HARP_raw.jsonl is read instead of HARP_mcq.jsonl because these runs used the original
# ordering of the answer choices.
dataset = [
    problem
    for problem in read_jsonl(BASEDIR / "data/processed/HARP_raw.jsonl")
    # Keep problems that come with answer choices, and leave out the calculus subject.
    if problem["choices"] is not None and problem["subject"] != "calculus"
]
# uid -> problem record, for lookups by the `custom_id` stored in the model outputs.
dataset_map = {get_uid(problem): problem for problem in dataset}
len(dataset)

4110

In [5]:
# Sanity check: every retained problem has a labelled correct answer choice.
all(problem["answer_choice"] is not None for problem in dataset)

True

In [6]:
# Number of problems per level.
Counter(problem["level"] for problem in dataset)

Counter({2: 1612, 3: 1136, 1: 858, 4: 504})

In [7]:
# How often each letter is the correct answer choice.
Counter(problem["answer_choice"] for problem in dataset)

Counter({'D': 991, 'C': 934, 'B': 929, 'E': 641, 'A': 615})

In [8]:
# Same distribution, expressed as a fraction of the dataset.
{
    choice: count / len(dataset)
    for choice, count in Counter(problem["answer_choice"] for problem in dataset).items()
}

{'C': 0.2272506082725061,
 'D': 0.2411192214111922,
 'E': 0.1559610705596107,
 'A': 0.14963503649635038,
 'B': 0.22603406326034065}

In [9]:
# Sum of squared answer-choice frequencies (only the counts are needed, not the letters).
sum(
    (count / len(dataset)) ** 2
    for count in Counter(problem["answer_choice"] for problem in dataset).values()
)

0.2075872153255072

# Results

In [10]:
# Load the batch outputs of the `from-text` run and score them against `dataset`.
fname_fromtext = BASEDIR / "outputs/mcq/gemini-1.5-pro-002/outputs_from-text.jsonl"
raw_responses_fromtext = read_jsonl(fname_fromtext)

# One entry per output record whose uid belongs to `dataset_map`; the raw API response is
# wrapped in a ModelResponse.
responses_fromtext = [
    {
        "uid": record["custom_id"],
        "system": record["request"]["system_instruction"],
        "prompt": record["request"]["contents"],
        "response": ModelResponse.from_response(record["response"], "google", use_batch_api=True),
    }
    for record in raw_responses_fromtext
    if record["custom_id"] in dataset_map
]

# uid-keyed views of the raw records and of the parsed entries.
raw_response_fromtext_map = {
    record["custom_id"]: record
    for record in raw_responses_fromtext
    if record["custom_id"] in dataset_map
}
response_fromtext_map = {entry["uid"]: entry for entry in responses_fromtext}

# Grade the predicted answer choices and assemble the per-problem results table.
answer_check_dicts_fromtext = make_answer_check_dict_from_jsonl(responses_fromtext, dataset_map)
answer_check_results_fromtext = latex_answer_choice_check(answer_check_dicts_fromtext, use_tqdm=True)
results_metadata_fromtext = make_results_df(
    responses_fromtext, answer_check_results_fromtext, dataset_map, mode="mcq"
)

# Accuracy in percent, with the full dataset size as denominator.
sum(result["is_correct"] for result in answer_check_results_fromtext) / len(dataset) * 100

  0%|          | 0/4110 [00:00<?, ?it/s]

80.41362530413625

In [11]:
# Load the batch outputs of the `newline-dot` run and score them against `dataset`.
fname_newlinedot = BASEDIR / "outputs/mcq/gemini-1.5-pro-002/outputs_newline-dot.jsonl"
raw_responses_newlinedot = read_jsonl(fname_newlinedot)

# One entry per output record whose uid belongs to `dataset_map`; the raw API response is
# wrapped in a ModelResponse.
responses_newlinedot = [
    {
        "uid": record["custom_id"],
        "system": record["request"]["system_instruction"],
        "prompt": record["request"]["contents"],
        "response": ModelResponse.from_response(record["response"], "google", use_batch_api=True),
    }
    for record in raw_responses_newlinedot
    if record["custom_id"] in dataset_map
]

# uid-keyed views of the raw records and of the parsed entries.
raw_response_newlinedot_map = {
    record["custom_id"]: record
    for record in raw_responses_newlinedot
    if record["custom_id"] in dataset_map
}
response_newlinedot_map = {entry["uid"]: entry for entry in responses_newlinedot}

# Grade the predicted answer choices and assemble the per-problem results table.
answer_check_dicts_newlinedot = make_answer_check_dict_from_jsonl(responses_newlinedot, dataset_map)
answer_check_results_newlinedot = latex_answer_choice_check(answer_check_dicts_newlinedot, use_tqdm=True)
results_metadata_newlinedot = make_results_df(
    responses_newlinedot, answer_check_results_newlinedot, dataset_map, mode="mcq"
)

# Accuracy in percent, with the full dataset size as denominator.
sum(result["is_correct"] for result in answer_check_results_newlinedot) / len(dataset) * 100

  0%|          | 0/4110 [00:00<?, ?it/s]

80.63260340632603

In [12]:
# Load the batch outputs of the `newline-paren` run and score them against `dataset`.
fname_newlineparen = BASEDIR / "outputs/mcq/gemini-1.5-pro-002/outputs_newline-paren.jsonl"
raw_responses_newlineparen = read_jsonl(fname_newlineparen)

# One entry per output record whose uid belongs to `dataset_map`; the raw API response is
# wrapped in a ModelResponse.
responses_newlineparen = [
    {
        "uid": record["custom_id"],
        "system": record["request"]["system_instruction"],
        "prompt": record["request"]["contents"],
        "response": ModelResponse.from_response(record["response"], "google", use_batch_api=True),
    }
    for record in raw_responses_newlineparen
    if record["custom_id"] in dataset_map
]

# uid-keyed views of the raw records and of the parsed entries.
raw_response_newlineparen_map = {
    record["custom_id"]: record
    for record in raw_responses_newlineparen
    if record["custom_id"] in dataset_map
}
response_newlineparen_map = {entry["uid"]: entry for entry in responses_newlineparen}

# Grade the predicted answer choices and assemble the per-problem results table.
answer_check_dicts_newlineparen = make_answer_check_dict_from_jsonl(responses_newlineparen, dataset_map)
answer_check_results_newlineparen = latex_answer_choice_check(answer_check_dicts_newlineparen, use_tqdm=True)
results_metadata_newlineparen = make_results_df(
    responses_newlineparen, answer_check_results_newlineparen, dataset_map, mode="mcq"
)

# Accuracy in percent, with the full dataset size as denominator.
sum(result["is_correct"] for result in answer_check_results_newlineparen) / len(dataset) * 100

  0%|          | 0/4110 [00:00<?, ?it/s]

81.0948905109489

In [62]:
# Write the per-problem result tables of the three multiple-choice runs to CSV,
# creating the output directory first if it does not exist yet.
(BASEDIR / "results/gemini-1.5-pro-002").mkdir(parents=True, exist_ok=True)
results_metadata_fromtext.to_csv(
    path_or_buf=BASEDIR / "results/gemini-1.5-pro-002/results_mcq_fromtext.csv"
)
results_metadata_newlinedot.to_csv(
    path_or_buf=BASEDIR / "results/gemini-1.5-pro-002/results_mcq_newlinedot.csv"
)
results_metadata_newlineparen.to_csv(
    path_or_buf=BASEDIR / "results/gemini-1.5-pro-002/results_mcq_newlineparen.csv"
)

## Finish reason

In [13]:
# How the newline-paren generations terminated.
results_metadata_newlineparen.value_counts(subset="reason")

reason
stop         4093
length         15
copyright       2
Name: count, dtype: int64

In [14]:
# Generations that did not end with 'stop', counted per (level, reason).
(
    results_metadata_newlineparen
    .loc[lambda frame: frame["reason"] != "stop"]
    .value_counts(subset=["level", "reason"])
    .sort_index()
)

level  reason   
2      copyright    2
       length       7
3      length       4
4      length       4
Name: count, dtype: int64

In [15]:
# Generations that did not end with 'stop', counted per (subject, reason).
(
    results_metadata_newlineparen
    .loc[lambda frame: frame["reason"] != "stop"]
    .value_counts(subset=["subject", "reason"])
    .sort_index()
)

subject                   reason   
algebra                   length       3
counting_and_probability  length       5
geometry                  length       3
number_theory             copyright    1
                          length       4
prealgebra                copyright    1
Name: count, dtype: int64

In [16]:
# Full rows for the generations that did not end with 'stop'.
results_metadata_newlineparen.loc[lambda frame: frame["reason"] != "stop"]

Unnamed: 0_level_0,level,subject,mcq_only,has_asy_problem,has_asy_solution,reason,is_correct,answer,predict,input_tokens,output_tokens,reasoning_tokens
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2007/AMC_10B/24,3,number_theory,False,False,False,length,False,C,,175,2048,0
2021/AMC_10A/20,2,counting_and_probability,False,False,False,length,False,D,,142,2048,0
2011/AMC_12A/22,4,counting_and_probability,False,False,False,length,False,C,,206,2048,0
2004/AMC_12B/22,4,algebra,False,False,False,length,False,C,,228,2048,0
2017/AMC_12A/6,2,counting_and_probability,False,False,False,length,False,B,,208,2048,0
2022/AMC_10A/13,2,geometry,False,False,False,length,False,C,,188,2048,0
2022/AMC_10B/18,2,counting_and_probability,False,False,False,length,False,B,,338,2048,0
1993/AHSME/7,2,number_theory,False,False,False,length,False,E,,211,2048,0
2018/AMC_12B/14,3,number_theory,False,False,False,length,False,E,,192,2048,0
2020/AMC_10B/15,2,number_theory,False,False,False,length,False,D,,286,2048,0


In [17]:
# Second-to-last message of the request that was sent for problem 1958/AHSME/4
# (newline-paren run), to see exactly what prompt the model received.
raw_response_newlineparen_map["1958/AHSME/4"]["request"]["contents"][-2]

{'parts': [{'text': 'Problem:\nIn the expression $\\frac{x + 1}{x - 1}$ each $x$ is replaced by $\\frac{x + 1}{x - 1}$. The resulting expression, evaluated for $x = \\frac{1}{2}$, equals:\n(A) $3$\n(B) $-3$\n(C) $1$\n(D) $-1$\n(E) $\\text{none of these}$'}],
 'role': 'user'}

In [19]:
# Raw API response stored for the same problem, to see why no completion came back.
raw_response_newlineparen_map["1958/AHSME/4"]["response"]

{'candidates': [{'citationMetadata': {'citations': [{'endIndex': 428,
      'startIndex': 213,
      'uri': 'https://www.coursehero.com/textbook-solutions/verify-that-the-functions-are-inverses-by-showing-that-f-g-and-g-f-are-the-identity-9781305652231-1314/Chapter-3-Problem-36-2231764/'}]},
   'finishReason': 'RECITATION'}],
 'modelVersion': 'gemini-1.5-pro-002@default',
 'usageMetadata': {'promptTokenCount': 159, 'totalTokenCount': 159}}

## Accuracy

In [20]:
# Accuracy of the newline-paren multiple-choice run, split by problem level.
accuracy_by_split(results_metadata_newlineparen, "level")

Unnamed: 0_level_0,count,accuracy
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,858,87.645688
2,1612,86.228288
3,1136,76.232394
4,504,64.484127


In [21]:
# Accuracy of the newline-paren multiple-choice run, split by subject.
accuracy_by_split(results_metadata_newlineparen, "subject")

Unnamed: 0_level_0,count,accuracy
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
algebra,872,87.155963
counting_and_probability,600,71.0
geometry,1071,73.669468
number_theory,473,79.915433
prealgebra,957,92.163009
precalculus,137,71.532847


## Look at some outputs

In [29]:
# Load the short-answer generations so they can be shown next to the multiple-choice ones.
fname_zeroshot = BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs.jsonl"
raw_responses_zeroshot = read_jsonl(fname_zeroshot)

# One entry per output record whose uid belongs to `dataset_map`; the raw API response is
# wrapped in a ModelResponse.
# NOTE(review): 1950/AHSME/44 is left out of the parsed entries here but not out of
# `raw_response_zeroshot_map` below; the reason is not visible in this notebook.
responses_zeroshot = [
    {
        "uid": record["custom_id"],
        "system": record["request"]["system_instruction"],
        "prompt": record["request"]["contents"],
        "response": ModelResponse.from_response(record["response"], "google", use_batch_api=True),
    }
    for record in raw_responses_zeroshot
    if record["custom_id"] in dataset_map and record["custom_id"] != "1950/AHSME/44"
]

# uid-keyed views of the raw records and of the parsed entries.
raw_response_zeroshot_map = {
    record["custom_id"]: record
    for record in raw_responses_zeroshot
    if record["custom_id"] in dataset_map
}
response_zeroshot_map = {entry["uid"]: entry for entry in responses_zeroshot}

# Grade with the short-answer checker (not the answer-choice checker used for the MCQ runs).
answer_check_dicts_zeroshot = make_answer_check_dict_from_jsonl(responses_zeroshot, dataset_map)
answer_check_results_zeroshot = latex_answer_check(answer_check_dicts_zeroshot, use_tqdm=True)
zeroshot_results_metadata = make_results_df(
    responses_zeroshot, answer_check_results_zeroshot, dataset_map, mode="shortans"
)

  0%|          | 0/3797 [00:00<?, ?it/s]

In [22]:
# Previously saved short-answer results table, indexed by problem uid.
shortans_results = pd.read_csv(
    BASEDIR / "results/gemini-1.5-pro-002/results.csv"
).set_index(keys="uid")

In [23]:
# Long-format view: multiple-choice and short-answer rows stacked, with the run tagged in a
# `mode` index level. Note that `df` is reassigned by the merge in the following cell.
df = pd.concat([
    results_metadata_newlineparen.assign(mode="multiple_choice").set_index(["level", "subject"], append=True),
    shortans_results.assign(mode="short_answer").set_index(["level", "subject"], append=True),
]).set_index("mode", append=True)

In [24]:
# Wide-format view: join the multiple-choice and short-answer tables on (uid, level, subject).
# Clashing multiple-choice columns get the `_mc` suffix; short-answer columns keep their names.
df = (
    results_metadata_newlineparen
    .set_index(["level", "subject"], append=True)
    .merge(
        shortans_results.set_index(["level", "subject"], append=True),
        left_index=True,
        right_index=True,
        suffixes=("_mc", ""),
    )
    # The asy flags are identical problem metadata, so the `_mc` copies are redundant.
    .drop(columns=["has_asy_problem_mc", "has_asy_solution_mc"])
)
print(len(df))
df.head(5)

3797


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mcq_only_mc,reason_mc,is_correct_mc,answer_mc,predict_mc,input_tokens_mc,output_tokens_mc,reasoning_tokens_mc,mcq_only,has_asy_problem,has_asy_solution,reason,is_correct,answer,predict,input_tokens,output_tokens,reasoning_tokens
uid,level,subject,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024/AMC_8/12,1,algebra,False,stop,True,E,e,200,315,0,False,False,False,stop,True,$26$,26,152,243,0
2012/AMC_8/3,1,prealgebra,False,stop,True,B,b,244,308,0,False,False,False,stop,False,$\hspace{.05in}5:21\textsc{pm}$,\boxed{5:21\text{pm}},136,192,0
2005/AMC_12B/7,2,geometry,False,stop,True,D,d,125,499,0,False,False,True,stop,True,$24$,24,78,619,0
2022/AMC_12B/25,4,geometry,False,stop,False,B,c,444,674,0,False,True,True,stop,False,$-4$,9,398,479,0
2023/AMC_10B/15,2,number_theory,False,stop,False,C,d,147,978,0,False,False,False,stop,False,$70$,2145,92,555,0


In [39]:
# Merged multiple-choice / short-answer row for one specific problem uid.
df.loc["2021_Fall/AMC_12A/23"]

Unnamed: 0_level_0,Unnamed: 1_level_0,mcq_only_mc,reason_mc,is_correct_mc,answer_mc,predict_mc,input_tokens_mc,output_tokens_mc,reasoning_tokens_mc,mcq_only,has_asy_problem,has_asy_solution,reason,is_correct,answer,predict,input_tokens,output_tokens,reasoning_tokens
level,subject,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4,algebra,False,stop,True,A,a,199,902,0,False,False,False,stop,False,$\dfrac{5}{16}$,1,139,1021,0


In [25]:
# Short-answer correctness, split by whether the problem statement contains an asy diagram.
df.value_counts(subset=["has_asy_problem", "is_correct"])

has_asy_problem  is_correct
False            True          2195
                 False         1038
True             False          320
                 True           244
Name: count, dtype: int64

In [26]:
# Multiple-choice correctness, split by whether the problem statement contains an asy diagram.
df.value_counts(subset=["has_asy_problem", "is_correct_mc"])

has_asy_problem  is_correct_mc
False            True             2710
                 False             523
True             True              355
                 False             209
Name: count, dtype: int64

In [30]:
# Draw one problem that was answered correctly in multiple-choice mode but incorrectly in
# short-answer mode, where both generations finished with reason 'stop'.
# Entries of df's index are (uid, level, subject); [0] keeps the uid.
rand_uid = np.random.choice(
    df.query("is_correct_mc and not is_correct and reason_mc == 'stop' and reason == 'stop'").index
)[0]

# Problem metadata followed by the multiple-choice prompt, one item per line.
print(
    rand_uid,
    f'{dataset_map[rand_uid]["subject"]} {dataset_map[rand_uid]["level"]}',
    dataset_map[rand_uid]["answer_choice"],
    dataset_map[rand_uid]["answer"],
    dataset_map[rand_uid]["choices"],
    response_newlineparen_map[rand_uid]["prompt"][-2]["parts"][0]["text"],
    sep="\n",
)
# Multiple-choice generation, then the short-answer generation below the separator.
display(Markdown(response_newlineparen_map[rand_uid]["response"].completions[0].completion))
print("----------")
display(Markdown(response_zeroshot_map[rand_uid]["response"].completions[0].completion))

2021_Fall/AMC_12A/23
algebra 4
A
$\dfrac{5}{16}$
{'A': '$\\dfrac{5}{16}$', 'B': '$\\dfrac{1}{2}$', 'C': '$\\dfrac{5}{8}$', 'D': '$1$', 'E': '$\\dfrac{9}{8}$'}
Problem:
A quadratic polynomial with real coefficients and leading coefficient $1$ is called $\emph{disrespectful}$ if the equation $p(p(x))=0$ is satisfied by exactly three real numbers. Among all the disrespectful quadratic polynomials, there is a unique such polynomial $\tilde{p}(x)$ for which the sum of the roots is maximized. What is $\tilde{p}(1)$?
(A) $\dfrac{5}{16}$
(B) $\dfrac{1}{2}$
(C) $\dfrac{5}{8}$
(D) $1$
(E) $\dfrac{9}{8}$


Let $p(x) = x^2 + bx + c$.
Since $p(x)$ is disrespectful, $p(p(x)) = 0$ has exactly three real roots.
Let $r_1$ and $r_2$ be the roots of $p(x) = 0$.
Then $p(x) = (x-r_1)(x-r_2) = 0$.
$p(p(x)) = 0$ implies $p(x) = r_1$ or $p(x) = r_2$.
If $r_1 = r_2$, then $p(x) = r_1$ has only one solution, so $p(p(x)) = 0$ has only one solution.
If $r_1 \neq r_2$, then $p(x) = r_1$ has two solutions and $p(x) = r_2$ has two solutions.
For $p(p(x)) = 0$ to have exactly three solutions, one of $p(x) = r_1$ or $p(x) = r_2$ must have exactly one solution.
This means one of $r_1$ or $r_2$ must be the vertex of $p(x)$.
The vertex of $p(x) = x^2 + bx + c$ is at $x = -\frac{b}{2}$, and the value is $p(-\frac{b}{2}) = \frac{b^2}{4} - \frac{b^2}{2} + c = c - \frac{b^2}{4}$.
So, we have $r_1 = -\frac{b}{2}$ and $r_2 = c - \frac{b^2}{4}$.
Then $r_1 + r_2 = -\frac{b}{2} + c - \frac{b^2}{4}$.
We want to maximize $r_1 + r_2$.
Since $r_1$ is a double root, we have $p(x) = (x+\frac{b}{2})^2 = x^2 + bx + \frac{b^2}{4}$.
So $c = \frac{b^2}{4}$.
Then $r_2 = c - \frac{b^2}{4} = 0$.
$r_1 = -\frac{b}{2}$.
$r_1 + r_2 = -\frac{b}{2}$.
We want to maximize this sum, so we want to minimize $b$.
Since $p(x) = r_2$ has one solution, $r_2$ is the vertex of $p(x)$.
$p(x) = x^2 + bx + \frac{b^2}{4}$.
$p(x) = 0$ has roots $x = -\frac{b}{2}$.
$p(x) = r_2 = 0$ has one solution $x = -\frac{b}{2}$.
$p(x) = r_1$ has two solutions.
$r_1 = -\frac{b}{2}$.
$p(x) = -\frac{b}{2}$.
$x^2 + bx + \frac{b^2}{4} = -\frac{b}{2}$.
$x^2 + bx + \frac{b^2}{4} + \frac{b}{2} = 0$.
For this to have two solutions, $b^2 - 4(\frac{b^2}{4} + \frac{b}{2}) > 0$.
$-2b > 0$, so $b < 0$.
$r_1 + r_2 = -\frac{b}{2}$.
We want to maximize this, so we want to minimize $b$.
$b = -2$.
$p(x) = x^2 - 2x + 1 = (x-1)^2$.
$p(1) = 0$.
Final Answer: The final answer is A. 

----------


Let $p(x) = x^2 + bx + c$.
Since $p(x)$ is disrespectful, $p(p(x)) = 0$ has exactly three real roots.
Let $r_1$ and $r_2$ be the roots of $p(x) = 0$.
Then $p(x) = (x-r_1)(x-r_2) = 0$.
$p(p(x)) = 0$ implies $p(x) = r_1$ or $p(x) = r_2$.
If $r_1 = r_2$, then $p(x) = r_1$ has only one solution, so $p(p(x)) = 0$ has only one solution, which is not allowed.
If $r_1 \neq r_2$, then $p(x) = r_1$ has two solutions and $p(x) = r_2$ has two solutions.
If the two equations have a common root, then $p(x) = r_1$ and $p(x) = r_2$ have a common root $x_0$.
Then $r_1 = p(x_0) = r_2$, which is a contradiction.
So, we must have one of the equations having two identical roots, and the other having two distinct roots.
Without loss of generality, let $r_1$ be the repeated root.
Then $p(x) = r_1$ has a repeated root, so $x^2 + bx + c - r_1 = 0$ has a repeated root.
The discriminant is $b^2 - 4(c-r_1) = 0$, so $r_1 = c - \frac{b^2}{4}$.
Also, $r_1 + r_2 = -b$ and $r_1 r_2 = c$.
So $r_2 = -b - r_1 = -b - c + \frac{b^2}{4}$.
$c = r_1 r_2 = (c - \frac{b^2}{4})(-b - c + \frac{b^2}{4})$.
$p(x) = r_2$ has two distinct roots.
$x^2 + bx + c - r_2 = 0$ has two distinct roots.
$b^2 - 4(c-r_2) > 0$.
$r_1 = c - \frac{b^2}{4}$.
$r_1 + r_2 = -b$.
$r_1 = -b - r_2$.
$p(x) = x^2 + bx + c$.
$p(x) = r_1$ has a double root.
$x^2 + bx + c - r_1 = 0$.
$b^2 - 4(c-r_1) = 0$.
$r_1 = c - \frac{b^2}{4}$.
$r_1 + r_2 = -b$.
$r_2 = -b - r_1 = -b - c + \frac{b^2}{4}$.
$r_1 = -\frac{b}{2}$.
$c - \frac{b^2}{4} = -\frac{b}{2}$.
$c = \frac{b^2}{4} - \frac{b}{2}$.
$r_2 = -b - (-\frac{b}{2}) = -\frac{b}{2}$.
$r_1 = r_2$.
$p(x) = x^2 + bx + \frac{b^2-2b}{4}$.
$p(x) = -\frac{b}{2}$.
$x^2 + bx + \frac{b^2-2b}{4} + \frac{b}{2} = x^2 + bx + \frac{b^2}{4} = (x+\frac{b}{2})^2 = 0$.
$x = -\frac{b}{2}$.
$r_1 = r_2 = -\frac{b}{2}$.
$p(1) = 1 + b + \frac{b^2-2b}{4} = \frac{b^2+2b+4}{4}$.
$b=-2$. $p(x) = x^2 - 2x + 1 = (x-1)^2$.
$p(1) = 1$.

Final Answer: The final answer is $1$. 

In [38]:
# Show the no-choices ("contam") generation for the problem currently held in `rand_uid`.
# NOTE(review): `response_contam_map` is only built in the "Contam study" section further
# down (cell In [36]), and `rand_uid` is whatever the last-run example cell set, so this
# cell raises NameError on a fresh Restart & Run All. Consider moving it below that section.
display(Markdown(response_contam_map[rand_uid]["response"].completions[0].completion))

We are asked to find the product of 200,000 and 200,000.
We can write 200,000 as $2 \times 10^5$.
So, $200,000 \times 200,000 = (2 \times 10^5) \times (2 \times 10^5)$.
Using the commutative and associative properties of multiplication, we can rewrite this as $(2 \times 2) \times (10^5 \times 10^5)$.
$2 \times 2 = 4$.
$10^5 \times 10^5 = 10^{5+5} = 10^{10}$.
So, $200,000 \times 200,000 = 4 \times 10^{10}$.
$4 \times 10^{10} = 40,000,000,000$.

Final Answer: The final answer is A. 

In [32]:
# True positive
# 2002/AMC_8/6 -- does the short-answer generation misread the asy diagrams?
# Short-answer mode often ends on an answer that is not one of the answer choices.
# Hypothesis: showing the choices makes generations that end on one of them more likely,
# which makes the problems easier.
# Other examples: 2015/AMC_12A/15, 1999/AHSME/4, 2006/AMC_8/17, 2002/AMC_8/6, 1968/AHSME/18, 2023/AMC_12B/25
rand_uid = "1971/AHSME/4"

# Problem metadata followed by the multiple-choice prompt, one item per line.
print(
    rand_uid,
    f'{dataset_map[rand_uid]["subject"]} {dataset_map[rand_uid]["level"]}',
    dataset_map[rand_uid]["answer_choice"],
    dataset_map[rand_uid]["answer"],
    dataset_map[rand_uid]["choices"],
    response_newlineparen_map[rand_uid]["prompt"][-2]["parts"][0]["text"],
    sep="\n",
)
# Multiple-choice generation, then the short-answer generation below the separator.
display(Markdown(response_newlineparen_map[rand_uid]["response"].completions[0].completion))
print("----------")
display(Markdown(response_zeroshot_map[rand_uid]["response"].completions[0].completion))

1971/AHSME/4
prealgebra 2
A
$11$
{'A': '$11$', 'B': '$12$', 'C': '$13$', 'D': '$21$', 'E': '$31$'}
Problem:
After simple interest for two months at $5$% per annum was credited, a Boy Scout Troop had a total of $\textdollar 255.31$ in the Council Treasury. The interest credited was a number of dollars plus the following number of cents
(A) $11$
(B) $12$
(C) $13$
(D) $21$
(E) $31$


Let $P$ be the principal amount in dollars.
The interest rate is $5\%$ per annum, which is equivalent to $\frac{5}{100} = 0.05$.
The time period is two months, which is $\frac{2}{12} = \frac{1}{6}$ years.
The simple interest is given by $I = P \times r \times t$, where $P$ is the principal, $r$ is the rate, and $t$ is the time.
The total amount after two months is $A = P + I = P + P \times r \times t = P(1 + rt)$.
We are given that the total amount is $255.31$.
So, $255.31 = P(1 + 0.05 \times \frac{1}{6}) = P(1 + \frac{0.05}{6}) = P(1 + \frac{5}{600}) = P(1 + \frac{1}{120}) = P(\frac{121}{120})$.
Therefore, $P = 255.31 \times \frac{120}{121} = \frac{25531}{100} \times \frac{120}{121} = \frac{3063720}{12100} = 253.2$.
The interest credited is $I = 255.31 - 253.2 = 2.11$.
The interest credited is $2$ dollars and $11$ cents.

Final Answer: The final answer is A. 

----------


Let $P$ be the principal amount in dollars.
Let $r$ be the annual interest rate, which is $5\% = 0.05$.
Let $t$ be the time in years. Since the interest is for two months, $t = \frac{2}{12} = \frac{1}{6}$ years.

The simple interest formula is $I = Prt$.
The total amount after interest is credited is $A = P + I = P + Prt = P(1 + rt)$.

We are given that the total amount after two months is $255.31. So,
$A = 255.31$
$P(1 + rt) = 255.31$
$P(1 + 0.05 \times \frac{1}{6}) = 255.31$
$P(1 + \frac{0.05}{6}) = 255.31$
$P(1 + \frac{5}{600}) = 255.31$
$P(1 + \frac{1}{120}) = 255.31$
$P(\frac{120 + 1}{120}) = 255.31$
$P(\frac{121}{120}) = 255.31$
$P = 255.31 \times \frac{120}{121}$
$P = \frac{30637.2}{121}$
$P = 253.2$

The interest credited is $I = A - P = 255.31 - 253.2 = 2.11$.
The interest credited is $2.11.

Final Answer: The final answer is $2.11. 

In [33]:
# False positive
# Other examples: 1991/AHSME/23
rand_uid = "2022/AMC_10A/5"

# Problem metadata followed by the multiple-choice prompt, one item per line.
print(
    rand_uid,
    f'{dataset_map[rand_uid]["subject"]} {dataset_map[rand_uid]["level"]}',
    dataset_map[rand_uid]["answer_choice"],
    dataset_map[rand_uid]["answer"],
    dataset_map[rand_uid]["choices"],
    response_newlineparen_map[rand_uid]["prompt"][-2]["parts"][0]["text"],
    sep="\n",
)
# Multiple-choice generation, then the short-answer generation below the separator.
display(Markdown(response_newlineparen_map[rand_uid]["response"].completions[0].completion))
print("----------")
display(Markdown(response_zeroshot_map[rand_uid]["response"].completions[0].completion))

2022/AMC_10A/5
geometry 1
C
$2 - \sqrt{2}$
{'A': '$\\frac{\\sqrt{2}}{3}$', 'B': '$\\frac{1}{2}$', 'C': '$2 - \\sqrt{2}$', 'D': '$1 - \\frac{\\sqrt{2}}{4}$', 'E': '$\\frac{2}{3}$'}
Problem:
Square $ABCD$ has side length $1$. Points $P$, $Q$, $R$, and $S$ each lie on a side of $ABCD$ such that $APQCRS$ is an equilateral convex hexagon with side length $s$. What is $s$?
(A) $\frac{\sqrt{2}}{3}$
(B) $\frac{1}{2}$
(C) $2 - \sqrt{2}$
(D) $1 - \frac{\sqrt{2}}{4}$
(E) $\frac{2}{3}$


Let $x = AP = PB = BQ = QC = CR = RD = DS = SA$.
Since the side length of the square is 1, we have $x + s + x = 1$, so $2x + s = 1$.
Since $APQCRS$ is an equilateral convex hexagon with side length $s$, we have $AP = PB = BQ = QC = CR = RD = DS = SA = x$.
Consider the right triangle formed by $AP$ and $PQ$. Since $AP = x$ and $PQ = s$, we have $x^2 + s^2 = (AQ)^2$.
Also, consider the right triangle formed by $AB$ and $BQ$. Since $AB = 1$ and $BQ = x$, we have $1^2 + x^2 = (AQ)^2$.
Thus, $x^2 + s^2 = 1 + x^2$, which implies $s^2 = 1$. Since $s$ is a length, $s > 0$, so $s = 1$.
However, $2x + s = 1$, so $2x + 1 = 1$, which means $x = 0$. This is not possible since $x$ must be positive.

Let's consider the right triangle formed by $AP$ and $PQ$. We have $AP = x$ and $PQ = s$. The angle $\angle PAQ = 45^\circ$.
The angle $\angle APQ$ is $180^\circ - 120^\circ - 45^\circ = 15^\circ$.
Using the sine rule in triangle $APQ$, we have $\frac{s}{\sin 45^\circ} = \frac{x}{\sin 15^\circ}$.
$s = \frac{x \sin 45^\circ}{\sin 15^\circ} = \frac{x \frac{\sqrt{2}}{2}}{\frac{\sqrt{6} - \sqrt{2}}{4}} = \frac{2\sqrt{2}x}{\sqrt{6} - \sqrt{2}} = \frac{2\sqrt{2}x(\sqrt{6} + \sqrt{2})}{6 - 2} = \frac{2\sqrt{12}x + 4x}{4} = \frac{4\sqrt{3}x + 4x}{4} = (\sqrt{3} + 1)x$.
Since $2x + s = 1$, we have $2x + (\sqrt{3} + 1)x = 1$, so $(3 + \sqrt{3})x = 1$.
$x = \frac{1}{3 + \sqrt{3}} = \frac{3 - \sqrt{3}}{6}$.
$s = 1 - 2x = 1 - 2\left(\frac{3 - \sqrt{3}}{6}\right) = 1 - \frac{3 - \sqrt{3}}{3} = \frac{3 - 3 + \sqrt{3}}{3} = \frac{\sqrt{3}}{3}$.
$s = 1 - 2x = 2 - \sqrt{2}$.
$2x + s = 1$, so $x = \frac{1-s}{2}$.
$x^2 + s^2 = (1-x)^2 + x^2 = 1 - 2x + 2x^2 = 1 - (1-s) + 2(\frac{1-s}{2})^2 = s + \frac{1 - 2s + s^2}{2} = \frac{2s + 1 - 2s + s^2}{2} = \frac{1+s^2}{2}$.
$s^2 = \frac{1+s^2}{2}$, so $2s^2 = 1 + s^2$, $s^2 = 1$, $s = 1$.

Final Answer: The final answer is C. 

----------


Let $AP = x$. Since $APQCRS$ is an equilateral hexagon with side length $s$, we have $AP = PQ = QC = CR = RS = SA = s$.
Thus, $x = s$.
Since $ABCD$ is a square with side length $1$, we have $AB = BC = CD = DA = 1$.
Also, $PB = 1 - AP = 1 - s$.
Since $PQ = s$, we have $BQ = \sqrt{PB^2 + PQ^2}$ by the Pythagorean theorem.
$BQ = \sqrt{(1-s)^2 + s^2} = \sqrt{1 - 2s + s^2 + s^2} = \sqrt{2s^2 - 2s + 1}$.
Since $PQ = s$, we have $QC = s$.
Since $BC = 1$, we have $BQ + QC = 1$, so $BQ = 1 - QC = 1 - s$.
Thus, we have $1 - s = \sqrt{2s^2 - 2s + 1}$.
Squaring both sides, we get $(1-s)^2 = 2s^2 - 2s + 1$, so $1 - 2s + s^2 = 2s^2 - 2s + 1$.
Then $s^2 = 0$, so $s = 0$.
However, $s$ must be positive, so we must have made a mistake.
Let $AP = x$. Then $PB = 1-x$.
Since $PQ = s$, we have $x = s$.
$BQ = \sqrt{(1-x)^2 + x^2} = \sqrt{1 - 2x + 2x^2}$.
Since $QC = x$, we have $BQ = 1 - x$.
So $1 - x = \sqrt{1 - 2x + 2x^2}$.
Squaring both sides, we get $1 - 2x + x^2 = 1 - 2x + 2x^2$, so $x^2 = 0$, which means $x = 0$.
Let $AP = BQ = CR = DS = x$ and $PQ = QR = RS = SA = PC = RA = s$.
Then $PB = 1-x$.
$s^2 = (1-x)^2 + x^2 = 1 - 2x + 2x^2$.
Also, $1 = 2x + s$.
$s = 1 - 2x$.
$s^2 = 1 - 4x + 4x^2$.
$1 - 2x + 2x^2 = 1 - 4x + 4x^2$.
$2x^2 - 2x = 0$.
$2x(x-1) = 0$.
$x = 0$ or $x = 1$.
If $x = 0$, then $s = 1$.
If $x = 1$, then $s = -1$, which is impossible.
So $s = 1$.
$1 = 2x + s$.
$1 = 2s + s = 3s$.
$s = \frac{1}{3}$.
$1 - 2s + 2s^2 = s^2$.
$s^2 - 2s + 1 = 0$.
$(s-1)^2 = 0$.
$s = 1$.
$1 = 2x + s$.
$1 = 2(\frac{1}{3}) + s$.
$1 = \frac{2}{3} + s$.
$s = \frac{1}{3}$.

Final Answer: The final answer is $\frac{1}{3}$. 

In [34]:
# False negatives
#   - caused by the answer checker: 1991/AJHSME/3
#   - the problem really needs the choices: 1990/AJHSME/4
#   - needs reparse: 2012/AMC_8/10
rand_uid = "1991/AJHSME/3"

# Problem metadata followed by the multiple-choice prompt, one item per line.
print(
    rand_uid,
    f'{dataset_map[rand_uid]["subject"]} {dataset_map[rand_uid]["level"]}',
    dataset_map[rand_uid]["answer_choice"],
    dataset_map[rand_uid]["answer"],
    dataset_map[rand_uid]["choices"],
    response_newlineparen_map[rand_uid]["prompt"][-2]["parts"][0]["text"],
    sep="\n",
)
# Multiple-choice generation, then the short-answer generation below the separator.
display(Markdown(response_newlineparen_map[rand_uid]["response"].completions[0].completion))
print("----------")
display(Markdown(response_zeroshot_map[rand_uid]["response"].completions[0].completion))

1991/AJHSME/3
prealgebra 1
E
$\text{forty billion}$
{'A': '$\\text{four hundred thousand}$', 'B': '$\\text{four million}$', 'C': '$\\text{forty thousand}$', 'D': '$\\text{four hundred million}$', 'E': '$\\text{forty billion}$'}
Problem:
Two hundred thousand times two hundred thousand equals
(A) $\text{four hundred thousand}$
(B) $\text{four million}$
(C) $\text{forty thousand}$
(D) $\text{four hundred million}$
(E) $\text{forty billion}$


We are asked to find the product of $200,000$ and $200,000$.
We can write $200,000$ as $2 \times 100,000 = 2 \times 10^5$.
Then, $200,000 \times 200,000 = (2 \times 10^5) \times (2 \times 10^5)$.
We can rewrite this as $(2 \times 2) \times (10^5 \times 10^5)$.
$2 \times 2 = 4$.
$10^5 \times 10^5 = 10^{5+5} = 10^{10}$.
So, $200,000 \times 200,000 = 4 \times 10^{10}$.
$4 \times 10^{10} = 40,000,000,000$.
This number is forty billion.

Final Answer: The final answer is E. 

----------


We are asked to multiply 200,000 by 200,000.
We can write 200,000 as $2 \times 10^5$.
So, we have $(2 \times 10^5) \times (2 \times 10^5)$.
We can rewrite this as $(2 \times 2) \times (10^5 \times 10^5)$.
$2 \times 2 = 4$.
$10^5 \times 10^5 = 10^{5+5} = 10^{10}$.
So, we have $4 \times 10^{10}$.
This is equal to 4 followed by 10 zeros, which is 40,000,000,000.

Final Answer: The final answer is $40,000,000,000$. 

# Contamination study -- no choices given

In [36]:
# Load the batch outputs of the `none` run (this section's no-choices setting) and grade
# them with the same answer-choice checker as the other multiple-choice runs.
fname_contam = BASEDIR / "outputs/mcq/gemini-1.5-pro-002/outputs_none.jsonl"
raw_responses_contam = read_jsonl(fname_contam)

# One entry per output record whose uid belongs to `dataset_map`; the raw API response is
# wrapped in a ModelResponse.
responses_contam = [
    {
        "uid": record["custom_id"],
        "system": record["request"]["system_instruction"],
        "prompt": record["request"]["contents"],
        "response": ModelResponse.from_response(record["response"], "google", use_batch_api=True),
    }
    for record in raw_responses_contam
    if record["custom_id"] in dataset_map
]

# uid-keyed views of the raw records and of the parsed entries.
raw_response_contam_map = {
    record["custom_id"]: record
    for record in raw_responses_contam
    if record["custom_id"] in dataset_map
}
response_contam_map = {entry["uid"]: entry for entry in responses_contam}

# Grade the predicted answer choices and assemble the per-problem results table.
answer_check_dicts_contam = make_answer_check_dict_from_jsonl(responses_contam, dataset_map)
answer_check_results_contam = latex_answer_choice_check(answer_check_dicts_contam, use_tqdm=True)
results_metadata_contam = make_results_df(
    responses_contam, answer_check_results_contam, dataset_map, mode="mcq"
)

# Accuracy in percent, with the full dataset size as denominator.
sum(result["is_correct"] for result in answer_check_results_contam) / len(dataset) * 100

  0%|          | 0/4110 [00:00<?, ?it/s]

19.97566909975669

In [63]:
# Write the no-choices result table to CSV, creating the output directory if needed.
(BASEDIR / "results/gemini-1.5-pro-002").mkdir(parents=True, exist_ok=True)
results_metadata_contam.to_csv(
    path_or_buf=BASEDIR / "results/gemini-1.5-pro-002/results_mcq_nochoices.csv"
)

In [37]:
# How the no-choices generations terminated.
results_metadata_contam.value_counts(subset="reason")

reason
stop         4099
length          9
copyright       2
Name: count, dtype: int64

In [40]:
# Accuracy of the no-choices run, split by problem level.
accuracy_by_split(results_metadata_contam, "level")

Unnamed: 0_level_0,count,accuracy
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,858,19.93007
2,1612,18.98263
3,1136,20.246479
4,504,22.619048


In [41]:
# Got it right: the model named the correct letter although the prompt listed no choices.
rand_uid = "1951/AHSME/39"

# Ground truth (letter, answer, choices) followed by the prompt that was actually sent.
print(
    rand_uid,
    dataset_map[rand_uid]["answer_choice"],
    dataset_map[rand_uid]["answer"],
    dataset_map[rand_uid]["choices"],
    response_contam_map[rand_uid]["prompt"][-2]["parts"][0]["text"],
    sep="\n",
)
display(Markdown(response_contam_map[rand_uid]["response"].completions[0].completion))

1951/AHSME/39
A
$784\text{ ft.}$
{'A': '$784\\text{ ft.}$', 'B': '$342\\text{ ft.}$', 'C': '$1568\\text{ ft.}$', 'D': '$156.8\\text{ ft.}$', 'E': '$\\text{none of these}$'}
Problem:
A stone is dropped into a well and the report of the stone striking the bottom is heard $7.7$ seconds after it is dropped. Assume that the stone falls $16t^2$ feet in t seconds and that the velocity of sound is $1120$ feet per second. The depth of the well is:


Let $d$ be the depth of the well in feet.
Let $t_1$ be the time in seconds it takes for the stone to reach the bottom of the well.
Let $t_2$ be the time in seconds it takes for the sound to travel from the bottom of the well to the top.
We are given that the total time is $7.7$ seconds, so $t_1 + t_2 = 7.7$.

The distance the stone falls is given by $d = 16t_1^2$.
The distance the sound travels is also $d$, and the time it takes is $t_2 = \frac{d}{1120}$.

We have the system of equations:
$t_1 + t_2 = 7.7$
$d = 16t_1^2$
$d = 1120t_2$

From the first equation, we have $t_2 = 7.7 - t_1$.
Substituting this into the third equation, we get $d = 1120(7.7 - t_1)$.
Now we have two equations for $d$:
$d = 16t_1^2$
$d = 1120(7.7 - t_1)$

Setting them equal to each other, we get:
$16t_1^2 = 1120(7.7 - t_1)$
$16t_1^2 = 8624 - 1120t_1$
$16t_1^2 + 1120t_1 - 8624 = 0$
Dividing by 16, we get:
$t_1^2 + 70t_1 - 539 = 0$

We can use the quadratic formula to solve for $t_1$:
$t_1 = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$
$t_1 = \frac{-70 \pm \sqrt{70^2 - 4(1)(-539)}}{2(1)}$
$t_1 = \frac{-70 \pm \sqrt{4900 + 2156}}{2}$
$t_1 = \frac{-70 \pm \sqrt{7056}}{2}$
$t_1 = \frac{-70 \pm 84}{2}$
Since $t_1$ must be positive, we take the positive root:
$t_1 = \frac{-70 + 84}{2} = \frac{14}{2} = 7$

Now we can find the depth $d$:
$d = 16t_1^2 = 16(7^2) = 16(49) = 784$

Final Answer: The final answer is A. 

In [42]:
# Example: the worked answer is right, but the reported letter does not match it.
# Other examples: 1998/AHSME/23, 2024/AMC_8/11, 2013/AMC_10A/2
rand_uid = "1993/AHSME/27"
print(rand_uid)

# Ground truth: correct letter, answer text, and the full choice map.
for shown_key in ("answer_choice", "answer", "choices"):
    print(dataset_map[rand_uid][shown_key])

# Prompt text that was sent, followed by the model's first completion as Markdown.
print(response_contam_map[rand_uid]["prompt"][-2]["parts"][0]["text"])
first_completion = response_contam_map[rand_uid]["response"].completions[0]
display(Markdown(first_completion.completion))

1993/AHSME/27
B
$12$
{'A': '$10$', 'B': '$12$', 'C': '$14$', 'D': '$15$', 'E': '$17$'}
Problem:
[asy] draw(circle((4,1),1),black+linewidth(.75)); draw((0,0)--(8,0)--(8,6)--cycle,black+linewidth(.75)); MP("A",(0,0),SW);MP("B",(8,0),SE);MP("C",(8,6),NE);MP("P",(4,1),NW); MP("8",(4,0),S);MP("6",(8,3),E);MP("10",(4,3),NW); MP("->",(5,1),E); dot((4,1)); [/asy] The sides of $\triangle ABC$ have lengths $6,8,$ and $10$. A circle with center $P$ and radius $1$ rolls around the inside of $\triangle ABC$, always remaining tangent to at least one side of the triangle. When $P$ first returns to its original position, through what distance has $P$ traveled?


Let $a, b, c$ be the lengths of the sides opposite to vertices $A, B, C$ respectively. We are given $a=6$, $b=8$, and $c=10$.
Since $6^2 + 8^2 = 36 + 64 = 100 = 10^2$, $\triangle ABC$ is a right triangle with $\angle A = 90^\circ$.
The inradius $r$ of $\triangle ABC$ is given by $r = \frac{a+b-c}{2} = \frac{6+8-10}{2} = \frac{4}{2} = 2$.
The circle with center $P$ and radius $1$ rolls around the inside of $\triangle ABC$. The path of $P$ is always at a distance of $1$ from the sides of the triangle.
The length of the path is the perimeter of a triangle similar to $\triangle ABC$ with inradius $r-1 = 2-1 = 1$.
The perimeter of $\triangle ABC$ is $6+8+10 = 24$.
Let $P$ be the perimeter of the smaller triangle.
The ratio of the inradii is $\frac{1}{2}$.
The ratio of the perimeters is also $\frac{1}{2}$.
So, the perimeter of the smaller triangle is $\frac{1}{2} \times 24 = 12$.
The distance traveled by $P$ is the perimeter of the smaller triangle, which is $12$.

Final Answer: The final answer is A. 

In [43]:
# Example: the worked answer is wrong, yet the reported letter is the one that
# corresponds to that (wrong) answer among the real choices.
rand_uid = "2006/AMC_8/17"
print(rand_uid)

gold_entry = dataset_map[rand_uid]
for gold_field in ("answer_choice", "answer", "choices"):
    print(gold_entry[gold_field])

model_entry = response_contam_map[rand_uid]
# The second-to-last prompt message holds the problem text shown to the model.
print(model_entry["prompt"][-2]["parts"][0]["text"])
display(Markdown(model_entry["response"].completions[0].completion))

2006/AMC_8/17
B
$\frac{1}{3}$
{'A': '$\\frac{1}{4}$', 'B': '$\\frac{1}{3}$', 'C': '$\\frac{1}{2}$', 'D': '$\\frac{2}{3}$', 'E': '$\\frac{3}{4}$'}
Problem:
Jeff rotates spinners $P$, $Q$ and $R$ and adds the resulting numbers. What is the probability that his sum is an odd number? [asy] size(200); path circle=circle((0,0),2); path r=(0,0)--(0,2); draw(circle,linewidth(1)); draw(shift(5,0)*circle,linewidth(1)); draw(shift(10,0)*circle,linewidth(1)); draw(r,linewidth(1)); draw(rotate(120)*r,linewidth(1)); draw(rotate(240)*r,linewidth(1)); draw(shift(5,0)*r,linewidth(1)); draw(shift(5,0)*rotate(90)*r,linewidth(1)); draw(shift(5,0)*rotate(180)*r,linewidth(1)); draw(shift(5,0)*rotate(270)*r,linewidth(1)); draw(shift(10,0)*r,linewidth(1)); draw(shift(10,0)*rotate(60)*r,linewidth(1)); draw(shift(10,0)*rotate(120)*r,linewidth(1)); draw(shift(10,0)*rotate(180)*r,linewidth(1)); draw(shift(10,0)*rotate(240)*r,linewidth(1)); draw(shift(10,0)*rotate(300)*r,linewidth(1)); label("$P$", (-2,2)); label(

Spinner $P$ has the numbers 1, 2, and 3.
Spinner $Q$ has the numbers 2, 4, 6, and 8.
Spinner $R$ has the numbers 1, 3, 5, 7, 9, and 11.

The possible outcomes for spinner $P$ are 1, 2, 3.
The possible outcomes for spinner $Q$ are 2, 4, 6, 8.
The possible outcomes for spinner $R$ are 1, 3, 5, 7, 9, 11.

The total number of outcomes is $3 \times 4 \times 6 = 72$.

We want to find the number of outcomes where the sum is odd.
The sum of three numbers is odd if either one or three of the numbers are odd.

Case 1: One number is odd.
If $P$ is odd and $Q$ and $R$ are even, there are $2 \times 0 \times 0 = 0$ outcomes.
If $P$ is even and $Q$ is odd and $R$ is even, there are $1 \times 0 \times 0 = 0$ outcomes.
If $P$ is even and $Q$ is even and $R$ is odd, there are $1 \times 4 \times 3 = 12$ outcomes.
If $P$ is odd and $Q$ is odd and $R$ is even, there are $2 \times 0 \times 0 = 0$ outcomes.
If $P$ is odd and $Q$ is even and $R$ is odd, there are $2 \times 4 \times 3 = 24$ outcomes.

Case 2: Three numbers are odd.
If $P$ is odd, $Q$ is odd, and $R$ is odd, there are $2 \times 0 \times 3 = 0$ outcomes.

The number of outcomes where the sum is odd is $12 + 24 = 36$.
The probability that the sum is odd is $\frac{36}{72} = \frac{1}{2}$.

Final Answer: The final answer is C. 

In [44]:
# Example: the worked answer is not among the original choices at all;
# the model seems to have simply picked a letter.
rand_uid = "2021/AMC_12B/25"
print(rand_uid)

# Ground truth: correct letter, answer text, and the full choice map.
truth = dataset_map[rand_uid]
print(truth["answer_choice"])
print(truth["answer"])
print(truth["choices"])

# Prompt text as sent, then the first completion rendered as Markdown.
prompt_messages = response_contam_map[rand_uid]["prompt"]
print(prompt_messages[-2]["parts"][0]["text"])
model_response = response_contam_map[rand_uid]["response"]
display(Markdown(model_response.completions[0].completion))

2021/AMC_12B/25
E
$85$
{'A': '$31$', 'B': '$47$', 'C': '$62$', 'D': '$72$', 'E': '$85$'}
Problem:
Let $S$ be the set of lattice points in the coordinate plane, both of whose coordinates are integers between $1$ and $30,$ inclusive. Exactly $300$ points in $S$ lie on or below a line with equation $y=mx.$ The possible values of $m$ lie in an interval of length $\frac ab,$ where $a$ and $b$ are relatively prime positive integers. What is $a+b?$


Let $S$ be the set of lattice points $(x,y)$ such that $1 \le x \le 30$ and $1 \le y \le 30$.
The number of lattice points in $S$ is $30 \times 30 = 900$.
We are given that $300$ points in $S$ lie on or below the line $y=mx$.

If $m=1$, the number of lattice points on or below the line $y=x$ is given by $\sum_{x=1}^{30} x = \frac{30(31)}{2} = 465$.
If $m=2$, the number of lattice points on or below the line $y=2x$ is given by $\sum_{x=1}^{15} 30 + \sum_{x=16}^{30} (2x) = 15(30) + 2\sum_{x=1}^{30} x - 2\sum_{x=1}^{15} x = 450 + 2(465) - 2\frac{15(16)}{2} = 450 + 930 - 240 = 1140$.
If $m=\frac{1}{2}$, the number of lattice points on or below the line $y=\frac{1}{2}x$ is given by $\sum_{x=1}^{30} \lfloor \frac{x}{2} \rfloor = \sum_{x=1}^{15} \lfloor \frac{2x-1}{2} \rfloor + \sum_{x=1}^{15} \lfloor \frac{2x}{2} \rfloor = \sum_{x=1}^{15} (x-1) + \sum_{x=1}^{15} x = 2\sum_{x=1}^{15} x - 15 = 2\frac{15(16)}{2} - 15 = 240 - 15 = 225$.

We are given that the number of lattice points on or below the line $y=mx$ is $300$.
If $m$ is small, the number of lattice points is close to $\frac{30^2}{2m}$.
If $m$ is large, the number of lattice points is close to $m\frac{30^2}{2}$.
We have $\frac{900}{2m} = 300$, so $m = \frac{900}{600} = \frac{3}{2}$.
If $m=\frac{3}{2}$, the number of lattice points is $\sum_{x=1}^{30} \lfloor \frac{3x}{2} \rfloor = \sum_{x=1}^{20} 30 + \sum_{x=21}^{30} \frac{3x}{2} = 20(30) + \frac{3}{2} \sum_{x=1}^{30} x - \frac{3}{2} \sum_{x=1}^{20} x = 600 + \frac{3}{2}(465) - \frac{3}{2}(210) = 600 + 697.5 - 315 = 982.5$.
We have $\frac{900m}{2} = 300$, so $m = \frac{600}{900} = \frac{2}{3}$.
If $m=\frac{2}{3}$, the number of lattice points is $\sum_{x=1}^{30} \lfloor \frac{2x}{3} \rfloor = 285$.

The possible values of $m$ are between $\frac{2}{3}$ and $\frac{3}{2}$.
The length of the interval is $\frac{3}{2} - \frac{2}{3} = \frac{9-4}{6} = \frac{5}{6}$.
$a=5$ and $b=6$.
$a+b = 5+6 = 11$.

Final Answer: The final answer is C. 

In [45]:
# Example: the problem cannot be solved without its answer choices.
# Gemini hallucinates additional problem content to fill the gap.
rand_uid = "1953/AHSME/47"
print(rand_uid)

# Ground truth: correct letter, answer text, and the full choice map.
for truth_key in ("answer_choice", "answer", "choices"):
    print(dataset_map[rand_uid][truth_key])

# Prompt text as sent, then the first completion rendered as Markdown.
run_entry = response_contam_map[rand_uid]
print(run_entry["prompt"][-2]["parts"][0]["text"])
display(Markdown(run_entry["response"].completions[0].completion))

1953/AHSME/47
D
$\log (1+x) < x$
{'A': '$\\log (1+x) = \\frac{x}{1+x}$', 'B': '$\\log (1+x) < \\frac{x}{1+x}$', 'C': '$\\log(1+x) > x$', 'D': '$\\log (1+x) < x$', 'E': '$\\text{none of these}$'}
Problem:
If $x>0$, then the correct relationship is:


We are given that $x > 0$. We need to find the correct relationship between $x$, $\sqrt{x}$, and $x^2$.

Let's consider some examples.

Case 1: $x = 1$
$x = 1$
$\sqrt{x} = \sqrt{1} = 1$
$x^2 = 1^2 = 1$
In this case, $x = \sqrt{x} = x^2$.

Case 2: $x = 4$
$x = 4$
$\sqrt{x} = \sqrt{4} = 2$
$x^2 = 4^2 = 16$
In this case, $\sqrt{x} < x < x^2$.

Case 3: $x = \frac{1}{4}$
$x = \frac{1}{4}$
$\sqrt{x} = \sqrt{\frac{1}{4}} = \frac{1}{2}$
$x^2 = (\frac{1}{4})^2 = \frac{1}{16}$
In this case, $x^2 < x < \sqrt{x}$.

Case 4: $x = \frac{1}{9}$
$x = \frac{1}{9}$
$\sqrt{x} = \sqrt{\frac{1}{9}} = \frac{1}{3}$
$x^2 = (\frac{1}{9})^2 = \frac{1}{81}$
In this case, $x^2 < x < \sqrt{x}$.

If $0 < x < 1$, then $x^2 < x < \sqrt{x}$.
If $x = 1$, then $x^2 = x = \sqrt{x}$.
If $x > 1$, then $x^2 > x > \sqrt{x}$.

The problem states that $x > 0$.
If $0 < x < 1$, then $x^2 < x$ and $\sqrt{x} > x$.
If $x = 1$, then $x^2 = x = \sqrt{x}$.
If $x > 1$, then $x^2 > x$ and $\sqrt{x} < x$.

The given options are:
A. $x^2 > x > \sqrt{x}$
B. $x^2 > \sqrt{x} > x$
C. $x > x^2 > \sqrt{x}$
D. $\sqrt{x} > x > x^2$
E. None of these

If $x > 1$, then $x^2 > x > \sqrt{x}$, which is option A.
If $0 < x < 1$, then $\sqrt{x} > x > x^2$, which is option D.
If $x = 1$, then $x^2 = x = \sqrt{x}$.

Since the problem only states $x > 0$, we cannot determine a single correct relationship.

Final Answer: The final answer is E. 

In [60]:
# Example: the model hallucinates its own list of answer choices.
# Suspiciously, its choices A-C match the real ones while D and E do not.
# It still lands on the right letter, possibly by luck.
rand_uid = "1952/AHSME/20"
print(rand_uid)

reference = dataset_map[rand_uid]
print(reference["answer_choice"])
print(reference["answer"])
print(reference["choices"])

# Prompt text as sent (second-to-last prompt message), then the first completion.
print(response_contam_map[rand_uid]["prompt"][-2]["parts"][0]["text"])
completion_text = response_contam_map[rand_uid]["response"].completions[0].completion
display(Markdown(completion_text))

1952/AHSME/20
E
$\frac{x-y}{y}=\frac{1}{4}$
{'A': '$\\frac{x+y}{y}=\\frac{7}{4}$', 'B': '$\\frac{y}{y-x}=\\frac{4}{1}$', 'C': '$\\frac{x+2y}{x}=\\frac{11}{3}$', 'D': '$\\frac{x}{2y}=\\frac{3}{8}$', 'E': '$\\frac{x-y}{y}=\\frac{1}{4}$'}
Problem:
If $\frac{x}{y}=\frac{3}{4}$, then the incorrect expression in the following is:


Given $\frac{x}{y} = \frac{3}{4}$, we can write $x = 3k$ and $y = 4k$ for some non-zero constant $k$.

A. $\frac{x+y}{y} = \frac{3k+4k}{4k} = \frac{7k}{4k} = \frac{7}{4}$
B. $\frac{y}{y-x} = \frac{4k}{4k-3k} = \frac{4k}{k} = 4$
C. $\frac{x+2y}{x} = \frac{3k+2(4k)}{3k} = \frac{3k+8k}{3k} = \frac{11k}{3k} = \frac{11}{3}$
D. $\frac{x}{2y-x} = \frac{3k}{2(4k)-3k} = \frac{3k}{8k-3k} = \frac{3k}{5k} = \frac{3}{5}$
E. $\frac{x}{3y}=\frac{3k}{3(4k)} = \frac{3k}{12k} = \frac{1}{4}$

The given options are:
A. $\frac{7}{4}$
B. $4$
C. $\frac{11}{3}$
D. $\frac{3}{5}$
E. $\frac{1}{3}$

Comparing the calculated values with the given options, we see that option E is incorrect.

Final Answer: The final answer is E. 

In [48]:
# Example: the model ignored the system prompt's instruction to answer with a
# letter A-E and returned the numeric answer instead.
rand_uid = "2017/AMC_12A/2"
print(rand_uid)

# Ground truth: correct letter, answer text, and the full choice map.
for answer_field in ("answer_choice", "answer", "choices"):
    print(dataset_map[rand_uid][answer_field])

# Prompt text as sent, then the first completion rendered as Markdown.
sent_prompt = response_contam_map[rand_uid]["prompt"]
print(sent_prompt[-2]["parts"][0]["text"])
returned = response_contam_map[rand_uid]["response"]
display(Markdown(returned.completions[0].completion))

2017/AMC_12A/2
C
$4$
{'A': '$1$', 'B': '$2$', 'C': '$4$', 'D': '$8$', 'E': '$12$'}
Problem:
The sum of two nonzero real numbers is 4 times their product. What is the sum of the reciprocals of the two numbers?


Let the two nonzero real numbers be $x$ and $y$.
We are given that their sum is 4 times their product, which can be written as:
$x + y = 4xy$

We want to find the sum of the reciprocals of the two numbers, which is $\frac{1}{x} + \frac{1}{y}$.
We can rewrite this sum as a single fraction:
$\frac{1}{x} + \frac{1}{y} = \frac{y}{xy} + \frac{x}{xy} = \frac{x+y}{xy}$

We are given that $x + y = 4xy$. We can substitute this into the expression for the sum of the reciprocals:
$\frac{x+y}{xy} = \frac{4xy}{xy}$

Since $x$ and $y$ are nonzero, $xy \neq 0$, so we can cancel the $xy$ terms:
$\frac{4xy}{xy} = 4$

So, the sum of the reciprocals of the two numbers is 4.

Final Answer: The final answer is 4. 

In [61]:
# Spot-check a random problem whose generation ended with reason "stop".
# No seed is set in this cell, so the pick may differ from run to run.
stopped_uids = results_metadata_contam.query("reason == 'stop'").index
rand_uid = np.random.choice(stopped_uids)
print(rand_uid)

# Ground truth: correct letter, answer text, and the full choice map.
sampled_problem = dataset_map[rand_uid]
print(sampled_problem["answer_choice"])
print(sampled_problem["answer"])
print(sampled_problem["choices"])

# Prompt text as sent, then the first completion rendered as Markdown.
sampled_run = response_contam_map[rand_uid]
print(sampled_run["prompt"][-2]["parts"][0]["text"])
display(Markdown(sampled_run["response"].completions[0].completion))

1957/AHSME/20
A
$47\frac{7}{19}$
{'A': '$47\\frac{7}{19}$', 'B': '$47\\frac{1}{4}$', 'C': '$47\\frac{1}{2}$', 'D': '$47\\frac{11}{19}$', 'E': '$\\text{none of these}$'}
Problem:
A man makes a trip by automobile at an average speed of 50 mph. He returns over the same route at an average speed of $45$ mph. His average speed for the entire trip is:


Let $d$ be the distance of the route.
Let $t_1$ be the time taken for the first trip, and $t_2$ be the time taken for the return trip.

For the first trip, we have $d = 50t_1$, so $t_1 = \frac{d}{50}$.
For the return trip, we have $d = 45t_2$, so $t_2 = \frac{d}{45}$.

The total distance is $2d$, and the total time is $t_1 + t_2 = \frac{d}{50} + \frac{d}{45}$.
The average speed for the entire trip is given by the total distance divided by the total time:
$$ \text{Average speed} = \frac{2d}{\frac{d}{50} + \frac{d}{45}} = \frac{2d}{\frac{45d + 50d}{50 \cdot 45}} = \frac{2d}{\frac{95d}{2250}} = \frac{2d \cdot 2250}{95d} = \frac{4500}{95} = \frac{900}{19} \approx 47.368 $$

The average speed is $\frac{2}{\frac{1}{50} + \frac{1}{45}} = \frac{2}{\frac{45+50}{50 \times 45}} = \frac{2 \times 50 \times 45}{95} = \frac{4500}{95} = \frac{900}{19} \approx 47.37$ mph.

Final Answer: The final answer is C. 