In [1]:
import sys
from pathlib import Path

BASEDIR = Path("/workspaces/HARP/") / "src"  # Replace with your own basedir path for the repo

sys.path.insert(0, str(BASEDIR))

In [2]:
from __future__ import annotations

import copy
import itertools
import json
import math
import os
import pickle
import pprint
import re
import textwrap
import time
import traceback
from collections import Counter, defaultdict
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tiktoken
from IPython.display import Markdown, clear_output, display
from tqdm.auto import tqdm

import vertexai
from vertexai.batch_prediction._batch_prediction import BatchPredictionJob

In [3]:
from eval.api import safe_unified_api_call
from eval.costs import count_tokens, get_pricing
from eval.eval import run_one, create_batch, make_answer_check_dict_from_jsonl, make_results_df, accuracy_by_split
from eval.parsing_lib import *
from eval.latex_answer_check import *
from eval.prompt import create_prompt
from eval.prompts import *
from eval.response import ModelResponse
from eval.utils import AMC_LETTER_CHOICES, read_jsonl, write_jsonl, get_uid, upload_blob, download_blob

# Data

In [4]:
dataset = [
    p 
    for p in read_jsonl(BASEDIR / "data/processed/HARP_raw.jsonl") 
    if (
        not p["contest"].endswith("MO") 
        and not p["multiple_choice_only"]
    )
]
dataset_map = {get_uid(p): p for p in dataset}

In [5]:
level_to_problems = {}
for p in dataset:
    if p["subject"] == "calculus":
        continue
    
    key = p["level"]
    if key not in level_to_problems:
        level_to_problems[key] = []
    level_to_problems[key].append(p)

subject_to_problems = {}
for p in dataset:
    if p["subject"] == "calculus":
        continue
    
    key = p["subject"]
    if key not in subject_to_problems:
        subject_to_problems[key] = []
    subject_to_problems[key].append(p)

In [6]:
{k: len(v) for k, v in level_to_problems.items()}

{2: 1505, 3: 1363, 4: 719, 5: 197, 6: 197, 1: 799}

In [7]:
{k: len(v) for k, v in subject_to_problems.items()}

{'prealgebra': 889,
 'algebra': 970,
 'geometry': 1268,
 'counting_and_probability': 812,
 'number_theory': 601,
 'precalculus': 240}

In [8]:
36*36*20

25920

# Init eval things

In [9]:
vertexai.init(project=os.environ.get("VERTEXAI_PROJECT_ID"), location="us-central1")

In [10]:
BUCKET_NAME = os.environ.get("GCLOUD_BUCKET_NAME")  # Should have the form "cloud-ai-platform-<YOUR_BUCKET>"

# ICL by level

In [36]:
# picked_bylevel_problems = {}
# for lvl, problems in level_to_problems.items():
#     picked = np.random.choice(problems, size=100, replace=False)
#     picked_bylevel_problems[lvl] = [get_uid(p) for p in picked]

# with open(BASEDIR / "outputs/short_answer/icl_bylevel_problems.pkl", "wb+") as f:
#     pickle.dump(picked_bylevel_problems, f)

In [9]:
with open(BASEDIR / "outputs/short_answer/icl_bylevel_problems.pkl", "rb") as f:
    picked_bylevel_problems = pickle.load(f)

In [10]:
remaining_bylevel_problems = {}
for lvl, problems in level_to_problems.items():
    remaining_bylevel_problems[lvl] = [p for p in problems if get_uid(p) not in picked_bylevel_problems[lvl]]

In [11]:
picked_problems = [dataset_map[uid] for uid in list(itertools.chain(*picked_bylevel_problems.values()))]

## Create and run batch

In [13]:
lvl_to_batch = {}
for icl_lvl, avail_problems in remaining_bylevel_problems.items():
    batch = []
    for prob in picked_problems:
        picked = np.random.choice(avail_problems, size=4, replace=False)
        messages = []
        for example in picked:
            messages.append({
                "role": "user",
                "content": f"Problem:\n{example['problem']}"
            })
            messages.append({
                "role": "assistant",
                "content": f"Solution:\n{example['solution_1']}\nFinal Answer: The final answer is {example['answer']}. I hope it is correct."
            })
        
        uid = get_uid(prob)
        request = run_one(
            prob,
            api="google",
            model="gemini-1.5-pro-002",
            fewshot_messages=messages,
            system_prompt=gemini_sysprompt,
            max_tokens=2048,
            num_completions=1,
            temperature=0,
            seed=0,
            stop_sequences=["I hope it is correct."],
            return_params=True,
            custom_id=uid,
            # just to remove irrelevant params
            logprobs=None,
            top_p=None,
        )
        batch.append(request)
    lvl_to_batch[icl_lvl] = batch

In [55]:
for lvl, batch in lvl_to_batch.items():
    write_jsonl(
        batch,
        BASEDIR / f"outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-{lvl}.jsonl",
    )

In [None]:
for lvl in range(1, 7):
    upload_blob(
        BUCKET_NAME,
        BASEDIR / f"outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-{lvl}.jsonl",
        f"prompt_data/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-{lvl}.jsonl",
    )

In [None]:
for lvl in range(1, 7):
    BatchPredictionJob.submit(
        source_model="gemini-1.5-pro-002",
        input_dataset=f"gs://{BUCKET_NAME}/prompt_data/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-{lvl}.jsonl",
        output_uri_prefix=f"gs://{BUCKET_NAME}/outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-{lvl}",
    )

In [None]:
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-1/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-level_lvl1.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-2/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-level_lvl2.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-3/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-level_lvl3.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-4/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-level_lvl4.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-5/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-level_lvl5.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-level_example-lvl-6/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-level_lvl6.jsonl",
)

## Results

In [12]:
lvl_raw_responses = []
lvl_responses = []
lvl_raw_response_map = []
lvl_response_map = []
lvl_answer_check_results = []
lvl_results = []

os.makedirs(BASEDIR / "results/gemini-1.5-pro-002", exist_ok=True)
for i in range(1, 7):
    fname = BASEDIR / f"outputs/short_answer/gemini-1.5-pro-002/outputs_icl-level_lvl{i}.jsonl"

    raw_responses_lvl_i = read_jsonl(fname)
    responses_lvl_i = [
        {
            "uid": x["custom_id"],
            "system": x["request"]["system_instruction"],
            "prompt": x["request"]["contents"],
            "response": ModelResponse.from_response(x["response"], "google", use_batch_api=True)
        }
        for x in raw_responses_lvl_i
    ]
    raw_response_lvl_i_map = {o["uid"]: raw for raw, o in zip(raw_responses_lvl_i, responses_lvl_i)}
    response_lvl_i_map = {o["uid"]: o for o in responses_lvl_i}

    answer_check_dicts_lvl_i = make_answer_check_dict_from_jsonl(responses_lvl_i, dataset_map)
    
    answer_check_results_lvl_i = latex_answer_check(answer_check_dicts_lvl_i, use_tqdm=True)
    
    results_lvl_i_metadata = make_results_df(
        responses_lvl_i, answer_check_results_lvl_i, dataset_map
    )

    results_lvl_i_metadata.to_csv(BASEDIR / f"results/gemini-1.5-pro-002/results_sa_icl-lvl_{i}.csv")

    lvl_raw_responses.append(raw_responses_lvl_i)
    lvl_responses.append(responses_lvl_i)
    lvl_raw_response_map.append(raw_response_lvl_i_map)
    lvl_response_map.append(response_lvl_i_map)
    lvl_answer_check_results.append(answer_check_results_lvl_i)
    lvl_results.append(results_lvl_i_metadata)

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

In [13]:
for i in range(6):
    print(f'Demonstrations of level {i+1} has accuracy {100*lvl_results[i]["is_correct"].mean():.02f}')

Demonstrations of level 1 has accuracy 45.17
Demonstrations of level 2 has accuracy 44.00
Demonstrations of level 3 has accuracy 45.00
Demonstrations of level 4 has accuracy 46.17
Demonstrations of level 5 has accuracy 47.83
Demonstrations of level 6 has accuracy 46.33


In [14]:
bylevel_matrix = []
for i in range(6):
    bylevel_matrix.append(
        lvl_results[i].groupby("level")["is_correct"].mean().to_frame().rename(columns={"is_correct": i+1})
    )
bylevel_matrix = pd.concat(bylevel_matrix, axis=1)
bylevel_matrix * 100

Unnamed: 0_level_0,1,2,3,4,5,6
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,73.0,72.0,77.0,74.0,78.0,77.0
2,77.0,72.0,71.0,72.0,75.0,75.0
3,58.0,57.0,56.0,53.0,59.0,52.0
4,42.0,40.0,44.0,45.0,44.0,44.0
5,16.0,16.0,14.0,21.0,21.0,21.0
6,5.0,7.0,8.0,12.0,10.0,9.0


# ICL by subject

In [15]:
SUBJECTS = [
    'prealgebra',
    'algebra',
    'geometry',
    'counting_and_probability',
    'number_theory',
    'precalculus',
]

In [60]:
# picked_bysubject_problems = {}
# for subj, problems in subject_to_problems.items():
#     picked = np.random.choice(problems, size=100, replace=False)
#     picked_bysubject_problems[subj] = [get_uid(p) for p in picked]

# with open(BASEDIR / "outputs/short_answer/icl_bysubject_problems.pkl", "wb+") as f:
#     pickle.dump(picked_bysubject_problems, f)

In [16]:
with open(BASEDIR / "outputs/short_answer/icl_bysubject_problems.pkl", "rb") as f:
    picked_bysubject_problems = pickle.load(f)

In [17]:
remaining_bysubject_problems = {}
for subj, problems in subject_to_problems.items():
    remaining_bysubject_problems[subj] = [p for p in problems if get_uid(p) not in picked_bysubject_problems[subj]]

In [18]:
{k: len(v) for k, v in remaining_bysubject_problems.items()}

{'prealgebra': 789,
 'algebra': 870,
 'geometry': 1168,
 'counting_and_probability': 712,
 'number_theory': 501,
 'precalculus': 140}

In [19]:
picked_subj_problems = [dataset_map[uid] for uid in list(itertools.chain(*picked_bysubject_problems.values()))]

## Create and run batch

In [68]:
subj_to_batch = {}
for icl_subj, avail_problems in remaining_bysubject_problems.items():
    batch = []
    for prob in picked_subj_problems:
        picked = np.random.choice(avail_problems, size=4, replace=False)
        messages = []
        for example in picked:
            messages.append({
                "role": "user",
                "content": f"Problem:\n{example['problem']}"
            })
            messages.append({
                "role": "assistant",
                "content": f"Solution:\n{example['solution_1']}\nFinal Answer: The final answer is {example['answer']}. I hope it is correct."
            })
        
        uid = get_uid(prob)
        request = run_one(
            prob,
            api="google",
            model="gemini-1.5-pro-002",
            fewshot_messages=messages,
            system_prompt=gemini_sysprompt,
            max_tokens=2048,
            num_completions=1,
            temperature=0,
            seed=0,
            stop_sequences=["I hope it is correct."],
            return_params=True,
            custom_id=uid,
            # just to remove irrelevant params
            logprobs=None,
            top_p=None,
        )
        batch.append(request)
    subj_to_batch[icl_subj] = batch

In [71]:
for subj, batch in subj_to_batch.items():
    write_jsonl(
        batch,
        BASEDIR / f"outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_{subj}.jsonl",
    )

In [None]:
for subj in subj_to_batch.keys():
    upload_blob(
        BUCKET_NAME,
        BASEDIR / f"outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_{subj}.jsonl",
        f"prompt_data/short_answer/gemini-1.5-pro-002/batch_icl-subject_{subj}.jsonl",
    )

In [None]:
for subj in subj_to_batch.keys():
    BatchPredictionJob.submit(
        source_model="gemini-1.5-pro-002",
        input_dataset=f"gs://{BUCKET_NAME}/prompt_data/short_answer/gemini-1.5-pro-002/batch_icl-subject_{subj}.jsonl",
        output_uri_prefix=f"gs://{BUCKET_NAME}/outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_{subj}",
    )

In [None]:
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_algebra/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-subject_algebra.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_counting_and_probability/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-subject_counting_and_probability.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_geometry/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-subject_geometry.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_number_theory/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-subject_number_theory.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_prealgebra/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-subject_prealgebra.jsonl",
)
download_blob(
    BUCKET_NAME,
    "outputs/short_answer/gemini-1.5-pro-002/batch_icl-subject_precalculus/prediction-model-<TIMESTAMP>/predictions.jsonl",
    BASEDIR / "outputs/short_answer/gemini-1.5-pro-002/outputs_icl-subject_precalculus.jsonl",
)

## Results

In [20]:
subj_raw_responses = []
subj_responses = []
subj_raw_response_map = []
subj_response_map = []
subj_answer_check_results = []
subj_results = []

os.makedirs(BASEDIR / "results/gemini-1.5-pro-002", exist_ok=True)
for subj in SUBJECTS:
    fname = BASEDIR / f"outputs/short_answer/gemini-1.5-pro-002/outputs_icl-subject_{subj}.jsonl"

    raw_responses_subj_i = read_jsonl(fname)
    responses_subj_i = [
        {
            "uid": x["custom_id"],
            "system": x["request"]["system_instruction"],
            "prompt": x["request"]["contents"],
            "response": ModelResponse.from_response(x["response"], "google", use_batch_api=True)
        }
        for x in raw_responses_subj_i
    ]
    raw_response_subj_i_map = {o["uid"]: raw for raw, o in zip(raw_responses_subj_i, responses_subj_i)}
    response_subj_i_map = {o["uid"]: o for o in responses_subj_i}

    answer_check_dicts_subj_i = make_answer_check_dict_from_jsonl(responses_subj_i, dataset_map)
    
    answer_check_results_subj_i = latex_answer_check(answer_check_dicts_subj_i, use_tqdm=True)
    
    results_subj_i_metadata = make_results_df(
        responses_subj_i, answer_check_results_subj_i, dataset_map
    )

    results_subj_i_metadata.to_csv(BASEDIR / f"results/gemini-1.5-pro-002/results_sa_icl-subject_{subj}.csv")

    subj_raw_responses.append(raw_responses_subj_i)
    subj_responses.append(responses_subj_i)
    subj_raw_response_map.append(raw_response_subj_i_map)
    subj_response_map.append(response_subj_i_map)
    subj_answer_check_results.append(answer_check_results_subj_i)
    subj_results.append(results_subj_i_metadata)

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

Function timed out after 10 seconds
('We are looking for the least positive integer $m$ such that\n\\[m(2!)(3!)(4!)... (16!)\\]\nis a perfect square.\nWe can rewrite the expression as\n\\[m \\prod_{n=2}^{16} n! = m \\prod_{n=2}^{16} \\prod_{k=1}^n k = m \\prod_{k=1}^{16} k^{17-k} = m \\prod_{k=1}^{16} k^{16-k+1}\\]\nWe want the exponent of each prime factor to be even.\nConsider the exponent of $k$.\nIf $k$ is odd, then $17-k$ is even.\nIf $k$ is even, then $17-k$ is odd.\nThe exponent of $k$ is $17-k$.\nWe have\n\\begin{align*} \\label{eq:prod} 2! 3! 4! ... 16! &= 2^1 \\cdot (2 \\cdot 3)^2 \\cdot (2 \\cdot 3 \\cdot 4)^3 \\cdots (2 \\cdot 3 \\cdots 16)^{15} \\\\ &= 2^{1+2+...+15} 3^{2+3+...+15} ... 15^{15+16} 16^{16} \\\\ &= 2^{120} 3^{112} 4^{105} ... 15^2 16^1\\end{align*}\nThe exponents of odd numbers are $112, 96, 80, 64, 48, 32, 16, 2$.\nThe exponents of even numbers are $120, 105, 84, 66, 51, 38, 27, 18, 11, 4, 1$.\nWe have\n\\[m \\prod_{n=2}^{16} n! = m \\prod_{k=1}^{16} k^{17-k

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

In [21]:
for i, subj in enumerate(SUBJECTS):
    print(f'Demonstrations of subject {subj} has accuracy {100*subj_results[i]["is_correct"].mean():.02f}')

Demonstrations of subject prealgebra has accuracy 56.50
Demonstrations of subject algebra has accuracy 61.17
Demonstrations of subject geometry has accuracy 60.50
Demonstrations of subject counting_and_probability has accuracy 59.00
Demonstrations of subject number_theory has accuracy 57.67
Demonstrations of subject precalculus has accuracy 58.17


In [22]:
bysubject_matrix = []
for i, subj in enumerate(SUBJECTS):
    bysubject_matrix.append(
        subj_results[i].groupby("subject")["is_correct"].mean().to_frame().rename(columns={"is_correct": subj})
    )
bysubject_matrix = pd.concat(bysubject_matrix, axis=1)
bysubject_matrix.loc[SUBJECTS] * 100

Unnamed: 0_level_0,prealgebra,algebra,geometry,counting_and_probability,number_theory,precalculus
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prealgebra,78.0,85.0,85.0,83.0,79.0,83.0
algebra,65.0,72.0,63.0,68.0,68.0,74.0
geometry,52.0,56.0,58.0,53.0,52.0,45.0
counting_and_probability,46.0,53.0,53.0,51.0,50.0,49.0
number_theory,55.0,56.0,57.0,52.0,51.0,57.0
precalculus,43.0,45.0,47.0,47.0,46.0,41.0
