# Gemini 1.5 Pro Multiple Choice Shuffled

In [1]:
import sys
from pathlib import Path

# Base directory from which the data, inputs and outputs paths in this notebook are built.
BASEDIR = Path("/workspaces/HARP/") / "src"  # Replace with your own basedir path for the repo

# Put BASEDIR first on the import path (presumably so the project-local `eval` package resolves).
sys.path.insert(0, str(BASEDIR))

In [2]:
from __future__ import annotations

import copy
import itertools
import json
import math
import os
import pickle
import pprint
import re
import textwrap
import time
import traceback
from collections import Counter, defaultdict
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tiktoken
from IPython.display import Markdown, clear_output, display
from tqdm.auto import tqdm

import vertexai
from vertexai.batch_prediction._batch_prediction import BatchPredictionJob

In [19]:
from eval.api import safe_unified_api_call
from eval.costs import count_tokens, get_pricing
from eval.eval import run_one, create_batch, make_answer_check_dict_from_jsonl
from eval.parsing_lib import *
from eval.latex_answer_check import *
from eval.prompt import create_prompt
from eval.prompts import *
from eval.utils import AMC_LETTER_CHOICES, read_jsonl, write_jsonl, get_uid, upload_blob, download_blob

# Data

In [27]:
# We don't use HARP_mcq.jsonl here because we used the original ordering of answer choices.
# Only problems that come with answer choices are kept.
# (Disabled extra filter: and x["subject"] != "calculus")
dataset = [
    prob
    for prob in read_jsonl(BASEDIR / "data/processed/HARP_raw.jsonl")
    if prob["choices"] is not None
]
# Lookup table from problem uid to the problem record.
dataset_map = {get_uid(prob): prob for prob in dataset}
len(dataset)

4115

In [5]:
# Sanity check: every multiple-choice problem has an answer letter.
all(prob["answer_choice"] is not None for prob in dataset)

True

In [6]:
# Number of problems per value of the "level" field.
Counter(prob["level"] for prob in dataset)

Counter({2: 1612, 3: 1136, 1: 858, 4: 504})

In [7]:
# How often each letter is the correct answer in the original ordering.
Counter(prob["answer_choice"] for prob in dataset)

Counter({'D': 991, 'C': 934, 'B': 929, 'E': 641, 'A': 615})

In [8]:
# Fraction of problems whose correct answer is each letter.
{
    letter: count / len(dataset)
    for letter, count in Counter(prob["answer_choice"] for prob in dataset).items()
}

{'C': 0.2272506082725061,
 'D': 0.2411192214111922,
 'E': 0.1559610705596107,
 'A': 0.14963503649635038,
 'B': 0.22603406326034065}

In [9]:
# Sum of squared answer-letter frequencies, i.e. the chance that two independent
# draws from the empirical answer distribution pick the same letter.
sum(
    (count / len(dataset)) ** 2
    for count in Counter(prob["answer_choice"] for prob in dataset).values()
)

0.2075872153255072

# Run eval

In [10]:
# Initialise the Vertex AI SDK; the project id is read from the environment
# (None is passed through when VERTEXAI_PROJECT_ID is unset).
vertexai.init(
    project=os.getenv("VERTEXAI_PROJECT_ID"),
    location="us-central1",
)

In [11]:
# Cloud Storage bucket used for batch inputs and outputs (None when the variable is unset).
BUCKET_NAME = os.getenv("GCLOUD_BUCKET_NAME")  # Should have the form "cloud-ai-platform-<YOUR_BUCKET>"

## Shuffle choices

In [12]:
def get_derangement(length=5):
    """Sample a random derangement of the first `length` answer letters.

    A derangement is a permutation in which no letter stays in its original
    position. It is found by rejection sampling: random permutations are drawn
    until one has no fixed point. Letters of AMC_LETTER_CHOICES beyond `length`
    are appended unchanged, so they keep their original positions.

    Args:
        length: Number of leading letters of AMC_LETTER_CHOICES to shuffle.

    Returns:
        The deranged leading letters (as plain `str`) followed by the
        untouched remaining letters.

    Raises:
        ValueError: If exactly one letter would be shuffled. A single element
            has no derangement, so the sampling loop would never terminate.
    """
    letters = AMC_LETTER_CHOICES[:length]
    if len(letters) == 1:
        raise ValueError("A single letter has no derangement; use length 0 or length >= 2.")

    while True:
        shuffle = np.random.permutation(letters)
        # Accept the permutation only if every letter moved away from its slot.
        if all(orig != new for orig, new in zip(letters, shuffle)):
            return [str(x) for x in shuffle] + AMC_LETTER_CHOICES[length:]

def get_distinct_derangements(n, length=5):
    """Sample `n` pairwise-distinct derangements of the first `length` letters.

    Args:
        n: Number of distinct derangements to return.
        length: Number of leading letters of AMC_LETTER_CHOICES to shuffle
            (passed through to `get_derangement`).

    Returns:
        A list of `n` distinct derangements, in the order they were sampled.

    Raises:
        ValueError: If `n` exceeds the number of derangements that exist for
            the requested length (e.g. 44 for 5 letters, 9 for 4 letters);
            without this check the sampling loop would never terminate.
    """
    # Count the available derangements with the recurrence
    # D(0) = 1, D(1) = 0, D(m) = (m - 1) * (D(m - 1) + D(m - 2)).
    num_letters = len(AMC_LETTER_CHOICES[:length])
    counts = [1, 0]
    for m in range(2, num_letters + 1):
        counts.append((m - 1) * (counts[-1] + counts[-2]))
    available = counts[num_letters]
    if n > available:
        raise ValueError(
            f"Requested {n} distinct derangements but only {available} exist "
            f"for {num_letters} letters."
        )

    res = []
    while len(res) < n:
        d = get_derangement(length)
        # Rejection step: keep only derangements not drawn before.
        if d not in res:
            res.append(d)
    return res

In [13]:
def create_problem_with_shuffled_choices(prob, new_order):
    """Return a copy of `prob` whose answer choices are relabelled by `new_order`.

    `new_order[i]` is the new letter given to the choice that was originally
    labelled `AMC_LETTER_CHOICES[i]`. The "choices" dict is rebuilt in
    AMC_LETTER_CHOICES order and "answer_choice" is remapped to the new letter
    of the correct choice. `prob` itself is left unmodified (shallow copy).
    """
    shuffled_prob = prob.copy()
    original_choices = shuffled_prob["choices"]

    # Move every choice text to its new letter.
    relabelled = {
        target: original_choices[source]
        for source, target in zip(AMC_LETTER_CHOICES, new_order)
    }

    # Re-insert in canonical letter order so the choices are listed A, B, C, ...
    ordered_choices = {}
    for letter in AMC_LETTER_CHOICES:
        ordered_choices[letter] = relabelled[letter]
    shuffled_prob["choices"] = ordered_choices

    # The correct answer follows its choice text to the new letter.
    answer_position = AMC_LETTER_CHOICES.index(shuffled_prob["answer_choice"])
    shuffled_prob["answer_choice"] = new_order[answer_position]
    return shuffled_prob

In [100]:
# Build, for every problem, a list of distinct derangements of its answer letters
# and persist the mapping uid -> derangements so the shuffles are reproducible.
all_derangements = {}
for prob_idx, prob in enumerate(dataset):
    # When choice E is a "none of ..." / "all of ..." option, only the first four
    # letters are shuffled; get_derangement appends the remaining letters unchanged.
    last_choice = prob["choices"]["E"].lower()
    length = 4 if ("none of" in last_choice or "all of" in last_choice) else 5

    # NOTE(review): `first_derangements` is not defined in the visible part of this
    # notebook; it must be in scope (one list of derangements per problem, indexed
    # like `dataset`) before this cell is run.
    seen_ds = first_derangements[prob_idx].copy()
    # Add three more derangements, each different from all the ones already kept.
    for _ in range(3):
        d = get_derangement(length)
        while d in seen_ds:
            d = get_derangement(length)
        seen_ds.append(d)
    all_derangements[get_uid(prob)] = seen_ds

with open(BASEDIR / "outputs/mcq/gemini-1.5-pro-002/mcq_derangements.pkl", "wb") as f:
    pickle.dump(all_derangements, f)

In [33]:
# Reload the saved uid -> derangements mapping.
# NOTE: pickle can execute arbitrary code on load; only read files you created yourself.
all_derangements = pickle.loads(
    (BASEDIR / "outputs/mcq/gemini-1.5-pro-002/mcq_derangements.pkl").read_bytes()
)

In [34]:
# Number of shuffled copies of the dataset to evaluate.
TOTAL_RUNS = 5

# datasets[t] holds every problem with its t-th stored derangement applied.
datasets = [[] for _ in range(TOTAL_RUNS)]
for prob in dataset:
    derangements = all_derangements[get_uid(prob)]
    for t in range(TOTAL_RUNS):
        datasets[t].append(
            create_problem_with_shuffled_choices(prob, derangements[t])
        )

In [35]:
# One uid -> problem lookup table per shuffled run.
datasets_map = []
for t in range(TOTAL_RUNS):
    datasets_map.append({get_uid(prob): prob for prob in datasets[t]})

In [22]:
# Show how the correct-answer letters are distributed in each shuffled run.
for t in range(TOTAL_RUNS):
    shuffled_answer_counts = Counter(prob["answer_choice"] for prob in datasets[t])
    print(shuffled_answer_counts)

Counter({'E': 850, 'A': 848, 'D': 818, 'C': 798, 'B': 796})
Counter({'A': 932, 'E': 857, 'C': 777, 'B': 773, 'D': 771})
Counter({'E': 854, 'A': 853, 'B': 810, 'C': 798, 'D': 795})
Counter({'A': 888, 'E': 855, 'B': 794, 'C': 792, 'D': 781})
Counter({'A': 903, 'E': 871, 'C': 802, 'B': 798, 'D': 736})


## Create batch

In [None]:
# Build one batch-request file per shuffled run and write it under inputs/.
for t in range(TOTAL_RUNS):
    batch_path = BASEDIR / f"inputs/mcq/gemini-1.5-pro-002/batch_choices_newline-paren_shuffle{t+1}.jsonl"
    batch = create_batch(
        datasets[t],
        api="google",
        model="gemini-1.5-pro-002",
        fewshot_messages=[],  # zero-shot: no few-shot examples
        system_prompt=gemini_multiple_choice_0shot_sysprompt,
        prompt_choices="newline_paren",
        max_tokens=2048,
        temperature=0,
        seed=0,
        stop_sequences=["I hope it is correct."],
        # just to remove irrelevant params
        logprobs=None,
        top_p=None,
    )
    write_jsonl(batch, batch_path)

## Upload to cloud

In [None]:
# Upload each run's batch-request file to the bucket under prompt_data/.
for t in range(TOTAL_RUNS):
    batch_filename = f"batch_choices_newline-paren_shuffle{t+1}.jsonl"
    upload_blob(
        BUCKET_NAME,
        BASEDIR / "inputs/mcq/gemini-1.5-pro-002" / batch_filename,
        f"prompt_data/mcq/gemini-1.5-pro-002/{batch_filename}",
    )

## Run batch job

In [None]:
# Submit one batch prediction job per shuffled run; each job reads the uploaded
# request file and writes under its own output prefix in the bucket.
for t in range(TOTAL_RUNS):
    run_name = f"batch_choices_newline-paren_shuffle{t+1}"
    BatchPredictionJob.submit(
        source_model="gemini-1.5-pro-002",
        input_dataset=f"gs://{BUCKET_NAME}/prompt_data/mcq/gemini-1.5-pro-002/{run_name}.jsonl",
        output_uri_prefix=f"gs://{BUCKET_NAME}/outputs/mcq/gemini-1.5-pro-002/{run_name}",
    )

In [None]:
# Download the predictions of every shuffled run from the bucket.
# Replace each "<TIMESTAMP>" placeholder with the timestamp that appears in the
# "prediction-model-..." folder name of the corresponding run's output.
prediction_timestamps = {
    1: "<TIMESTAMP>",
    2: "<TIMESTAMP>",
    3: "<TIMESTAMP>",
    4: "<TIMESTAMP>",
    5: "<TIMESTAMP>",
}

for run, timestamp in prediction_timestamps.items():
    download_blob(
        BUCKET_NAME,
        f"outputs/mcq/gemini-1.5-pro-002/batch_choices_newline-paren_shuffle{run}/prediction-model-{timestamp}/predictions.jsonl",
        BASEDIR / f"outputs/mcq/gemini-1.5-pro-002/outputs_newline-paren_shuffle{run}.jsonl",
    )