In [None]:
import torch
from tqdm import tqdm
import pandas as pd
from tqdm import tqdm
from helpers import *

## Load sub-sampled test set

In [None]:
# Evaluation questions, read from the JSONL file named below.
TEST_SET_PATH = "USMLE_test_samples_300.jsonl"
questions = read_jsonl_file(TEST_SET_PATH)

In [None]:
# Worked examples handed to build_few_shot_prompt in the few-shot section.
FEW_SHOT_SAMPLES_PATH = "USMLE_few_shot_samples.jsonl"
few_shot_prompts = read_jsonl_file(FEW_SHOT_SAMPLES_PATH)

## Parse ground truth and store the correct option keys

In [None]:
# Build `ground_truth`: for every question, the key of the option whose text
# equals the question's reference answer (item["answer"]).
#
# A question with no matching option keeps the "" placeholder so that
# `ground_truth` stays index-aligned with `questions`, but its index is
# recorded and reported below instead of being dropped silently -- an empty
# label would otherwise distort the accuracy figures without any trace.
ground_truth = []
unmatched_question_indices = []

for question_index, item in enumerate(questions):
    # First option key whose text equals the reference answer, or None.
    matching_key = next(
        (key for key, value in item["options"].items() if value == item["answer"]),
        None,
    )
    if matching_key is None:
        unmatched_question_indices.append(question_index)
        matching_key = ""
    ground_truth.append(matching_key)

if unmatched_question_indices:
    print(
        f"WARNING: {len(unmatched_question_indices)} question(s) have no option "
        f"matching their answer text; ground truth left empty at indices "
        f"{unmatched_question_indices}"
    )

## Evaluate Zero-Shot GPT-3.5 Performance

In [None]:
# Collect one raw model reply per question using the zero-shot prompt.
# The request settings are identical for every call, so they are defined once.
zero_shot_request_kwargs = dict(model_name="gpt-3.5-turbo", temperature=0.0, max_tokens=10)

# Replies are appended one at a time so that everything gathered so far is
# still in the list if a call raises partway through the run.
zero_shot_gpt_answers = []
for item in tqdm(questions):
    zero_shot_gpt_answers.append(
        get_response(build_zero_shot_prompt(PROMPT, item), **zero_shot_request_kwargs)
    )

In [None]:
# Run parse_answer over every raw zero-shot reply, preserving order
# (presumably this extracts the chosen option -- see helpers to confirm).
zero_shot_gpt_predictions = list(map(parse_answer, zero_shot_gpt_answers))

In [None]:
# Score the zero-shot predictions against the ground truth and show the result.
zero_shot_gpt_accuracy = calculate_accuracy(ground_truth, zero_shot_gpt_predictions)
print(zero_shot_gpt_accuracy)

## Few-Shot Prompting GPT-3.5

In [None]:
# Collect one raw model reply per question using the few-shot prompt, which is
# built from the shared instruction, the question, and `few_shot_prompts`.
few_shot_request_kwargs = dict(model_name="gpt-3.5-turbo", temperature=0.0, max_tokens=10)

# Replies are appended one at a time so that everything gathered so far is
# still in the list if a call raises partway through the run.
few_shot_gpt_answers = []
for item in tqdm(questions):
    few_shot_gpt_answers.append(
        get_response(
            build_few_shot_prompt(PROMPT, item, few_shot_prompts),
            **few_shot_request_kwargs,
        )
    )

In [None]:
# Run parse_answer over every raw few-shot reply, preserving order
# (presumably this extracts the chosen option -- see helpers to confirm).
few_shot_gpt_predictions = list(map(parse_answer, few_shot_gpt_answers))

In [None]:
# Score the few-shot predictions against the ground truth and show the result.
few_shot_gpt_accuracy = calculate_accuracy(ground_truth, few_shot_gpt_predictions)
print(few_shot_gpt_accuracy)

## CoT Prompting GPT-3.5

In [None]:
# Collect one raw model reply per question using the chain-of-thought prompt,
# built from COT_INSTRUCTION, the question, and COT_EXAMPLES.
# NOTE(review): max_tokens=100 caps the reply length -- confirm this leaves
# room for the reasoning plus the final answer that parse_answer_cot reads.
cot_request_kwargs = dict(model_name="gpt-3.5-turbo", temperature=0.0, max_tokens=100)

# Replies are appended one at a time so that everything gathered so far is
# still in the list if a call raises partway through the run.
cot_gpt_answers = []
for item in tqdm(questions):
    cot_gpt_answers.append(
        get_response(
            build_cot_prompt(COT_INSTRUCTION, item, COT_EXAMPLES),
            **cot_request_kwargs,
        )
    )

In [None]:
# Run parse_answer_cot (the CoT-specific parser) over every raw CoT reply,
# preserving order.
cot_gpt_predictions = list(map(parse_answer_cot, cot_gpt_answers))

In [None]:
# Score the chain-of-thought predictions against the ground truth and show the result.
cot_gpt_accuracy = calculate_accuracy(ground_truth, cot_gpt_predictions)
print(cot_gpt_accuracy)

## Dump all outputs and results

In [None]:
# Save each zero-shot raw reply next to the choice parsed from it.
# The column names go to the constructor rather than being assigned to
# `.columns` afterwards: with no answers the frame would have zero columns and
# that assignment raises a length-mismatch error, whereas this form yields an
# empty two-column table.
zero_shot_gpt_df = pd.DataFrame(
    list(zip(zero_shot_gpt_answers, zero_shot_gpt_predictions)),
    columns=["Generated Answer", "Extracted Choice"],
)
zero_shot_gpt_df.to_csv("gpt_3.5_zero_shot_predictions.csv", index=False)

In [None]:
# Save each few-shot raw reply next to the choice parsed from it.
# The column names go to the constructor rather than being assigned to
# `.columns` afterwards: with no answers the frame would have zero columns and
# that assignment raises a length-mismatch error, whereas this form yields an
# empty two-column table.
few_shot_gpt_df = pd.DataFrame(
    list(zip(few_shot_gpt_answers, few_shot_gpt_predictions)),
    columns=["Generated Answer", "Extracted Choice"],
)
few_shot_gpt_df.to_csv("gpt_3.5_few_shot_predictions.csv", index=False)

In [None]:
# Save each chain-of-thought raw reply next to the choice parsed from it.
# The column names go to the constructor rather than being assigned to
# `.columns` afterwards: with no answers the frame would have zero columns and
# that assignment raises a length-mismatch error, whereas this form yields an
# empty two-column table.
cot_gpt_df = pd.DataFrame(
    list(zip(cot_gpt_answers, cot_gpt_predictions)),
    columns=["Generated Answer", "Extracted Choice"],
)
cot_gpt_df.to_csv("gpt_3.5_cot_predictions.csv", index=False)