##### Copyright 2025 Google LLC.

Licensed under the Apache 2.0 License.

In [None]:
# @title Licensed under the Apache 2.0 License (the "License"); { display-mode: "form" }
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## PH-LLM Professional Exam Evaluation

This notebook provides the MCQ evaluation pipeline with dummy data and models.



In [None]:
# @title Import
import ast
import collections
import copy
import re
import time
from typing import Any, Callable, List, Optional, Protocol, Set, Tuple
import numpy as np
import pandas as pd

In [None]:
# @title MCQ Prompts & Constants

# Option labels an MCQ may use, in order: '(A)' through '(F)'.
VALID_ANSWER_OPTIONS = [f'({letter})' for letter in 'ABCDEF']

# Instruction prefix used when building a scoring prompt. The `{domain}` and
# `{options}` placeholders are filled in by `create_prompt_to_generate_mcqs`.
MCQS_EVAL_INSTRUCTION = (
    'Instruction: The following is a multiple choice question about {domain}'
    ' knowledge. Output a single option from {options} as the final answer.\n'
)

# Domains accepted by `create_prompt_to_generate_mcqs` (case-sensitive).
VALID_HEALTH_DOMAINS = ['Sleep', 'Fitness']

# Two-shot chain-of-thought prompt template for sleep MCQs.
# Format placeholders: {domain}, {mcq_options}, {mcq_question}.
# The model is instructed to wrap its final choice in <answer></answer> tags.
# NOTE(review): the jet-lag example below is answered (C), while
# SLEEP_TAKE_STEP_BACK_MCQ answers the same question (B) -- confirm which
# answer is intended before relying on either few-shot example.
SLEEP_COT_MCQ = (
    'Instructions: The following are multiple choice questions about {domain}'
    ' knowledge. Solve them in a step-by-step fashion, starting by summarizing'
    ' the available information. Output a single option from {mcq_options} as'
    ' the final answer and enclosed by xml tags <answer></answer>.\n\n Here are'
    ' two examples:\n ##Question: A 26-year-old female presents asking about'
    ' jet lag. She has no past medical history, lives on the East Coast, and'
    " travels frequently to the West Coast for business. The person's career"
    ' involves planning evening events, and she reports significant sleepiness'
    ' at these events that impairs her ability to perform her job. She wants to'
    ' know how she can adapt to Pacific Standard Time (PST) before she travels.'
    ' What treatment plan will help this patient adapt to PST prior to'
    ' travel?\n(A) Light in evening and later bedtime 1 day before'
    ' traveling\n(B) Light in morning and earlier wake time 3 days before'
    ' traveling\n(C) Light in evening and later bedtime 3 days before'
    ' traveling\n(D) Light in morning and earlier wake time 1 month before'
    ' traveling\n(E) Light in evening and later bedtime 1 month before'
    " traveling\nExplanation: Let's solve this step-by-step, referring to"
    ' authoritative sources as needed. The West Coast is 3 timezones behind the'
    ' East Coast. Since she plans evening events, she needs to shift her'
    ' schedule to stay up 3 hours later. Adding light in the evening will'
    ' disrupt melatonin production, delaying sleepiness. Transitioning'
    ' timezones typically takes one day per timezone.\nAnswer:'
    ' <answer>(C)</answer>\n\n##Question: What is a difference in the clinical'
    ' features of obstructive sleep apnea (OSA) in older adults compared to'
    ' younger adults?\n(A) Increased prevalence of OSA among older adults'
    ' occurs after age 65.\n(B) Clinical symptoms associated with OSA (e.g.'
    ' excessive daytime sleepiness) are less common and less severe in older'
    ' adults than in younger adults.\n(C) Increased risk of cardiopulmonary'
    ' diseases is greater among elderly than among younger individuals.\n(D)'
    ' Excess body weight, snoring, and witnessed apneas more consistently'
    ' indicate OSA in older adults than in younger individuals.\n(E) There are'
    ' no significant OSA differences between older and younger'
    " adults.\nExplanation: Let's solve this step-by-step, referring to"
    ' authoritative sources as needed. Compared to younger patients with the'
    ' same apnea hypopnea index, OSA in older patients is associated with less'
    ' sleepiness (Morrell et al 2012). This observation has led some to suggest'
    ' that OSA in the elderly may represent a distinct physiological'
    ' phenotype.\nAnswer: <answer>(B)</answer>\n\n ##Question:'
    ' {mcq_question}\nExplanation: Let us solve this step-by-step, referring to'
    ' authoritative sources as needed. '
)


# Two-shot chain-of-thought prompt template for fitness MCQs.
# Format placeholders: {mcq_options}, {mcq_question}. There is no {domain}
# placeholder here ("fitness" is hard-coded in the text), so a `domain`
# keyword passed to .format() is simply ignored.
# The model is instructed to wrap its final choice in <answer></answer> tags.
FITNESS_COT_MCQ = (
    'Instructions: The following are multiple choice questions about fitness'
    ' knowledge. Solve them in a step-by-step fashion, starting by summarizing'
    ' the available information. Output a single option from {mcq_options} as'
    ' the final answer and enclosed by xml tags <answer></answer>.\n\n Here are'
    ' two examples: \n ##Question: A 30-year-old male is looking for a workout'
    ' plan to improve his cardiovascular health. He has no known heart'
    ' conditions and has a sedentary lifestyle. His goal is to increase stamina'
    ' and reduce the risk of heart diseases. Which of the following workout'
    ' plans is most suitable for his goals?\n(A) High-intensity interval'
    ' training (HIIT) 5 days a week\n(B) Moderate-intensity aerobic exercises'
    ' like brisk walking 30 minutes a day, 5 days a week\n(C) Weight training'
    ' focused on major muscle groups 4 days a week\n(D) Yoga and stretching'
    ' exercises twice a week\n(E) Swimming for 60 minutes daily\nExplanation:'
    " Let's solve this step-by-step, referring to authoritative sources as"
    ' needed. For someone with a sedentary lifestyle looking to improve'
    ' cardiovascular health, it is recommended to start with moderate-intensity'
    ' aerobic exercises. This approach is effective in increasing stamina and'
    ' is less likely to cause injury.\nAnswer:'
    ' <answer>(B)</answer>\n\n##Question: What is a common mistake beginners'
    ' make when starting a strength training program?\n(A) Not including enough'
    ' rest days in their routine\n(B) Focusing only on cardio exercises\n(C)'
    ' Lifting weights that are too heavy leading to poor form\n(D) Ignoring'
    ' flexibility and balance training\n(E) Spending too much time on warm-up'
    " exercises\nExplanation: Let's solve this step-by-step, referring to"
    ' authoritative sources as needed. A common mistake for beginners in'
    ' strength training is lifting weights that are too heavy, which can lead'
    ' to poor form and increase the risk of injury.\nAnswer:'
    ' <answer>(C)</answer>\n\n ##Question: {mcq_question}\nExplanation: Let us'
    ' solve this step-by-step, referring to authoritative sources as needed. '
)


# One-shot "step-back" (principles first, then answer) prompt template for
# sleep MCQs. Format placeholders: {domain}, {mcq_options}, {mcq_question}.
# The template ends with '## Principles:' so the model continues from there,
# and it is instructed to wrap its final choice in <answer></answer> tags.
# NOTE(review): the worked example concludes (B), while SLEEP_COT_MCQ answers
# the same jet-lag question (C) -- confirm which answer is intended.
SLEEP_TAKE_STEP_BACK_MCQ = (
    'You are a {domain} expert. I want you to solve a multiple-choice question'
    ' in sleep. Here is an example of how to solve a question using an'
    ' abstraction-reasoning approach, starting by recalling relevant principles'
    ' related to the subject of each question. Then apply these principles step'
    ' by step to logically deduce the correct answer. Output a single option'
    ' from {mcq_options} as the final answer and enclosed by xml tags'
    ' <answer></answer>.\n\nHere is an example:\n##Question:\nA 26-year-old'
    ' female presents asking about jet lag. She has no past medical history,'
    ' lives on the East Coast, and travels frequently to the West Coast for'
    ' business. Her career involves planning evening events, and she reports'
    ' significant sleepiness at these events that impairs her ability to'
    ' perform her job. She wants to know how she can adapt to Pacific Standard'
    ' Time (PST) before she travels. What treatment plan will help this patient'
    ' adapt to PST prior to travel?\nOptions:\n(A) Light in evening and later'
    ' bedtime 1 day before traveling\n(B) Light in morning and earlier wake'
    ' time 3 days before traveling\n(C) Light in evening and later bedtime 3'
    ' days before traveling\n(D) Light in morning and earlier wake time 1 month'
    ' before traveling\n(E) Light in evening and later bedtime 1 month before'
    ' traveling\n\n## Principles:\nCircadian Rhythms: The human body operates'
    ' on a circadian rhythm, an internal clock that cycles roughly every 24'
    ' hours. This rhythm is influenced by external cues, especially light.'
    ' Exposure to light can shift the circadian rhythm, making a person feel'
    " more awake or sleepy.\n\nJet Lag: Jet lag occurs when a person's internal"
    ' clock is out of sync with the time zone they are in. This is common when'
    ' traveling across multiple time zones. To adjust to a new time zone, the'
    " body's circadian rhythm needs to be shifted.\n\nLight Therapy: Exposure"
    ' to light at certain times can help shift the circadian rhythm. Exposure'
    ' to light in the morning advances the circadian clock (making one wake up'
    ' earlier), while exposure in the evening delays it (making one stay awake'
    ' later).\n\n## Answer:\nUsing the principles of Circadian Rhythms, Jet'
    ' Lag, and Light Therapy, we can solve the problem as following:\nThe'
    ' patient needs to adapt from the East Coast time to the West Coast time,'
    ' which is 3 hours behind. To do this, she needs to adjust her body clock'
    ' to wake up and go to sleep later according to her current (East Coast)'
    ' time zone, which aligns with the normal waking and sleeping hours in the'
    ' Pacific Standard Time.\n\nLooking at the options:\n- (A) and (C) suggest'
    ' delaying the circadian rhythm (light in the evening and later bedtime),'
    ' which would make her wake up and sleep later according to East Coast'
    ' time. However, this is counterproductive as it would exacerbate the issue'
    ' when she is on the West Coast.\n- (B) Light in the morning and earlier'
    ' wake time 3 days before traveling would advance her circadian rhythm.'
    ' This means she would wake up earlier according to East Coast time, which'
    ' is aligned with waking up at a regular time in PST.\n- (D) and (E)'
    ' propose changes starting 1 month before traveling, which is impractical'
    ' for someone who travels frequently.\n\nTherefore, the correct answer is'
    ' <answer>(B)</answer>. Light in the morning and earlier wake time 3 days'
    ' before traveling. This method would advance her circadian rhythm to'
    ' better match the Pacific Standard Time, helping her to cope with her'
    ' sleepiness during evening events on the West Coast.\n\n ## Question:\n'
    ' {mcq_question}\n\n## Principles:\n'
)


# One-shot "step-back" (principles first, then answer) prompt template for
# fitness MCQs. Format placeholders: {mcq_options}, {mcq_question}. There is
# no {domain} placeholder ("fitness" is hard-coded in the text).
# The template ends with '## Principles:' so the model continues from there,
# and it is instructed to wrap its final choice in <answer></answer> tags.
FITNESS_TAKE_STEP_BACK_MCQ = (
    'You are a fitness expert. I want you to solve a multiple-choice question'
    ' in fitness. Here is an example of how to solve a question using an'
    ' abstraction-reasoning approach, starting by recalling relevant principles'
    ' related to the subject of each question. Then apply these principles step'
    ' by step to logically deduce the correct answer. Output a single option'
    ' from {mcq_options} as the final answer and enclosed by xml tags'
    ' <answer></answer>.\n\nHere is an example: \n##Question:\nA 35-year-old'
    ' male is looking to increase muscle mass. He has been working out'
    ' consistently for a year and follows a balanced diet. He wants to know'
    ' which change in his workout routine will be most effective for gaining'
    ' muscle mass. What would you recommend?\nOptions:\n(A) Increase cardio'
    ' exercises and decrease weight lifting\n(B) Focus on high-repetition'
    ' weight lifting with lower weights\n(C) Incorporate high-intensity'
    ' interval training (HIIT) twice a week\n(D) Increase weight lifting with'
    ' heavier weights and lower repetitions\n(E) Maintain the current routine'
    ' without changes\n\n## Principles:\nMuscle Hypertrophy: Muscle growth'
    ' occurs when muscle fibers are damaged and repair themselves, leading to'
    ' an increase in muscle size. This is best achieved through resistance'
    ' training that challenges the muscles.\n\nProgressive Overload: To'
    " continue gaining muscle, it's important to progressively increase the"
    ' demands on the musculoskeletal system. This can be done by lifting'
    ' heavier weights, increasing repetitions, or changing the exercises'
    ' performed.\n\nExercise Variation: Incorporating a variety of exercises'
    ' can help target different muscle groups and prevent plateaus in muscle'
    ' growth.\n\n## Answer:\nUsing the principles of Muscle Hypertrophy,'
    ' Progressive Overload, and Exercise Variation, we can solve the problem'
    ' as following:\nThe individual is already engaged in consistent workouts'
    ' and has a balanced diet, which is fundamental for muscle growth. To'
    ' further enhance muscle mass, the focus should be on increasing the'
    ' intensity of workouts in a way that challenges the muscles more'
    ' significantly.\n\nLooking at the options:\n- (A) focuses on increasing'
    ' cardio, which is less effective for muscle hypertrophy compared to'
    ' resistance training.\n- (B) involves high-repetition lifting with lower'
    ' weights, which is more endurance-focused rather than hypertrophy.\n- (C)'
    ' HIIT can be beneficial for overall fitness but is not the most efficient'
    ' for muscle growth compared to targeted resistance training.\n- (D)'
    ' Increasing weight lifting with heavier weights and lower repetitions is'
    ' aligned with the principles of muscle hypertrophy and progressive'
    ' overload.\n- (E) Maintaining the current routine will not provide the'
    ' necessary stimulus for further muscle growth.\n\nTherefore, the correct'
    ' answer is <answer>(D)</answer>. Increasing weight lifting with heavier'
    ' weights and lower repetitions will effectively promote muscle growth by'
    ' adhering to the principles of muscle hypertrophy and progressive'
    ' overload.\n\n ## Question:\n {mcq_question}\n\n## Principles:\n'
)

# Field names for sleep MCQ samples. `add_instruction_to_prompt` reads the
# INPUTS (question text) and EVAL_LABELS (answer options) keys; the others are
# presumably the matching dataset column names -- verify against the dataset.
SLEEP_MCQ_INPUTS_FEATURE_NAME = 'question'
SLEEP_MCQ_TARGETS_FEATURE_NAME = 'answer'
SLEEP_MCQ_DIFFICULTY_FEATURE_NAME = 'difficulty'
SLEEP_MCQ_EVAL_LABELS_FEATURE_NAME = 'choices'
# Difficulty labels for sleep MCQs (presumably the values found in the
# 'difficulty' column -- verify against the dataset).
SLEEP_MCQ_DIFF_LEVEL_EASY = 'Easy'
SLEEP_MCQ_DIFF_LEVEL_MODERATE = 'Moderate'
SLEEP_MCQ_DIFF_LEVEL_HARD = 'Hard'
# Field names for fitness MCQ samples (same keys as the sleep ones).
FITNESS_MCQ_INPUTS_FEATURE_NAME = 'question'
FITNESS_MCQ_TARGETS_FEATURE_NAME = 'answer'
FITNESS_MCQ_EVAL_LABELS_FEATURE_NAME = 'choices'


def create_prompt_to_generate_mcqs(
    mcq_question: str, mcq_options: dict[str, str], mcq_domain: str
) -> str:
  """Builds the instruction-prefixed prompt for a single MCQ.

  Args:
    mcq_question: The question text; appended verbatim after the instruction.
    mcq_options: Mapping keyed by option label (e.g. '(A)'). Only the keys are
      used: they must be exactly the first `len(mcq_options)` entries of
      `VALID_ANSWER_OPTIONS`.
    mcq_domain: One of `VALID_HEALTH_DOMAINS`.

  Returns:
    `MCQS_EVAL_INSTRUCTION` filled in with the domain and the sorted option
    labels, followed by the question.

  Raises:
    ValueError: If any argument is empty, the domain is unsupported, or the
      option labels are not a contiguous prefix of the valid labels.
  """
  required_fields = (mcq_question, mcq_options, mcq_domain)
  if not all(required_fields):
    raise ValueError('MCQ example is missing required fields.')

  if mcq_domain not in VALID_HEALTH_DOMAINS:
    raise ValueError(f'MCQ domain {mcq_domain} is not supported.')

  # The labels must be '(A)', '(B)', ... with no gaps.
  provided_labels = set(mcq_options)
  expected_labels = set(VALID_ANSWER_OPTIONS[: len(mcq_options)])
  if provided_labels != expected_labels:
    raise ValueError(f'MCQ options are not valid: {mcq_options}')

  return (
      MCQS_EVAL_INSTRUCTION.format(
          domain=mcq_domain, options=', '.join(sorted(mcq_options))
      )
      + mcq_question
  )

In [None]:
# @title Define Your Own LLM


class LanguageModel(Protocol):
  """Protocol defining the expected interface for a language model object."""

  def Generate(self, prompt: str, **kwargs) -> List[str]:
    """Generates outputs based on a given prompt.

    Args:
      prompt: The input text prompt.
      **kwargs: Additional arguments for generation (e.g., max_tokens,
        temperature).

    Returns:
      A list of generated text outputs.
    """
    ...

  def Score(self, prompt: str, completion: str, **kwargs) -> float:
    """Returns the total log probability of `completion` given `prompt`.

    Args:
      prompt: The input text prompt (e.g., the question).
      completion: The text whose log probability is to be computed when appended
        to the prompt (e.g., the expected answer).
      **kwargs: Additional arguments for scoring.

    Returns:
      A float representing the total log probability of the completion
      given the prompt. A higher (less negative) value indicates higher
      likelihood.
    """
    ...


# --- Example Concrete Implementation of a Language Model ---
class MyExampleLanguageModel:
  """A dummy language model exposing `Generate` and `Score`.

  In a real scenario, this would wrap your actual LM (e.g., an OpenAI API
  client, a Hugging Face model, a custom local model). Here `Generate` echoes
  the prompt and `Score` returns a random log probability, so evaluation
  results obtained with this class are meaningless placeholders.
  """

  def __init__(self, model_identifier: str, num_connections: int = 1, **kwargs):
    """Stores the configuration; no real model is loaded.

    Args:
      model_identifier: Name or address of the model to (pretend to) load.
      num_connections: Number of parallel connections; stored but unused here.
      **kwargs: Extra configuration, kept as-is in `self._config`.
    """
    print(
        f"Initializing MyExampleLanguageModel with: {model_identifier=},"
        f" {num_connections=}, {kwargs=}"
    )
    self._model_identifier = model_identifier
    self._num_connections = num_connections
    # In a real implementation, you would load your model here
    # e.g., self._model = load_from_huggingface(model_identifier)
    # Or self._api_client = apiclient(api_key=kwargs.get("api_key"))
    self._config = kwargs

  def Generate(self, prompt: str, **kwargs) -> List[str]:
    """Example implementation for text generation.

    Args:
      prompt: The input text prompt.
      **kwargs: Accepted for interface compatibility and ignored.

    Returns:
      A one-element list holding a canned response that echoes the prompt.
    """
    del kwargs  # Unused by the dummy model.
    # Return a list (not a bare string): callers iterate over the result and
    # would otherwise receive individual characters.
    return [f"Your prompt: {prompt} was processed."]

  def Score(self, prompt: str, completion: str, **kwargs) -> float:
    """Example implementation for scoring a completion given a prompt.

    Args:
      prompt: The input text prompt.
      completion: The text to score.
      **kwargs: Accepted for interface compatibility and ignored.

    Returns:
      A random, finite log probability (<= 0) as a Python float.
    """
    del kwargs  # Unused by the dummy model.
    print(f"Scoring completion '{completion}' given prompt '{prompt}'")

    # np.random.rand() is in [0, 1); flip it to (0, 1] so the log is finite.
    sampled_prob = 1.0 - np.random.rand()
    return float(np.log(sampled_prob))


# --- Model factory stub: edit _get_lm_model to load your own model ---
def _get_lm_model(
    model_address: str,
    num_conn: int = 1,
    **kwargs,
) -> LanguageModel:
  """Creates the language model used for evaluation.

  This is a stub: replace the body with your own logic to load any language
  model (LM). As written it always builds a `MyExampleLanguageModel`.

  Args:
    model_address: The address or identifier for the model to load.
    num_conn: The number of parallel calls to use when running the model.
    **kwargs: Additional keyword arguments forwarded to the model constructor.

  Returns:
    A model object exposing the `LanguageModel` interface (i.e., `Generate`
    and `Score` methods).
  """
  language_model = MyExampleLanguageModel(model_address, num_conn, **kwargs)
  return language_model

In [None]:
def find_majority_vote_answer(
    dict_list: List[dict[str, Any]],
) -> Optional[Tuple[str, Any]]:
  """Finds the answer that strictly wins the vote among sampled results.

  Args:
    dict_list: Per-sample result dicts. Each entry is read at the
      'model_answer' key; the winning entry is also read at
      'model_generations'. Entries whose answer is missing or is not one of
      '(A)'..'(E)' do not vote.

  Returns:
    `(answer, generations)` taken from the first entry whose answer is the
    unique most frequent valid option, or None when there are no valid votes
    or the top two options are tied.
  """
  # A tuple (not a set) so that unhashable answers are compared, not hashed.
  valid_options = ('(A)', '(B)', '(C)', '(D)', '(E)')

  votes = collections.Counter()
  for entry in dict_list:
    entry_answer = entry.get('model_answer')
    if entry_answer in valid_options:
      votes[entry_answer] += 1

  top_two = votes.most_common(2)
  if not top_two:
    return None
  # A tie for first place means there is no clear winner.
  if len(top_two) == 2 and top_two[0][1] == top_two[1][1]:
    return None

  winner = top_two[0][0]
  for entry in dict_list:
    if entry.get('model_answer') == winner:
      return (winner, entry['model_generations'])
  return None


def _postprocess_generation_answer(generations: set[str]) -> str:
  """Process the generated answers."""
  answer_re = re.compile(r'<answer>(\([ABCDE]\))</answer>', re.IGNORECASE)
  answers = []
  for gen in generations:
    gen = gen.strip()
    matcher = answer_re.search(gen)
    if matcher:
      answers.append(matcher.group(1).upper())
  # If no generation yielded a valid formatted Answer, flag as skipped.
  if not answers:
    return {_MODEL_GEN: generations, _SKIPPED: 1}
  # This extracts a list of most common answers within the xml tags
  # <answer></answer>, takes the first entry (in case of ties), and then
  # extracts the answer text (the second entry in the pair is the number of
  # times it appeared).
  model_answer = collections.Counter(answers).most_common(1)[0][0]
  return model_answer


def add_instruction_to_prompt(
    samples: list[dict[str, Any]], domain: str
) -> list[dict[str, Any]]:
  """Returns deep copies of `samples` whose question carries the instruction.

  Args:
    samples: MCQ sample dicts; each must hold the question and the answer
      options under the domain's feature-name keys.
    domain: Either 'Sleep' or 'Fitness'.

  Returns:
    A new list of copied samples in which the question field has been replaced
    by the output of `create_prompt_to_generate_mcqs`. The inputs are not
    modified.

  Raises:
    ValueError: If `domain` is neither 'Sleep' nor 'Fitness'.
  """
  if domain not in ('Sleep', 'Fitness'):
    raise ValueError(f'Invalid domain: {domain}')

  if domain == 'Sleep':
    question_field = SLEEP_MCQ_INPUTS_FEATURE_NAME
    options_field = SLEEP_MCQ_EVAL_LABELS_FEATURE_NAME
  else:
    question_field = FITNESS_MCQ_INPUTS_FEATURE_NAME
    options_field = FITNESS_MCQ_EVAL_LABELS_FEATURE_NAME

  def _with_instruction(original: dict[str, Any]) -> dict[str, Any]:
    # Work on a deep copy so the caller's sample is left untouched.
    updated = copy.deepcopy(original)
    updated[question_field] = create_prompt_to_generate_mcqs(
        updated[question_field], updated[options_field], domain
    )
    return updated

  return [_with_instruction(sample) for sample in samples]


def read_mcq_dataset(
    dataset_path: str, domain: str, difficulty_level: Optional[list[str]] = None
) -> pd.DataFrame:
  """Loads the MCQ CSV and keeps only the rows for the requested domain.

  Args:
    dataset_path: Path to a CSV file with at least 'choices' and 'domain'
      columns ('difficulty' is also required when filtering by difficulty).
    domain: Value that the 'domain' column must equal.
    difficulty_level: Optional list of accepted 'difficulty' values. When
      empty or None, no difficulty filtering is applied.

  Returns:
    The matching rows, with the 'choices' column parsed from its string form
    into Python objects.
  """
  with open(dataset_path, 'r') as csv_file:
    mcq_frame = pd.read_csv(csv_file)

  # 'choices' is stored as a Python literal string; parse it back safely.
  mcq_frame['choices'] = mcq_frame['choices'].apply(ast.literal_eval)

  keep_rows = mcq_frame['domain'] == domain
  if difficulty_level:
    keep_rows = keep_rows & mcq_frame['difficulty'].isin(difficulty_level)
  return mcq_frame[keep_rows]

In [None]:
# @title Model Evaluation


def evaluate_model(
    llm_address: str,
    dataset_path: str,
    domain: str,
    eval_func: Callable[..., dict[str, Any]],
    num_examples: int = -1,
    num_replicas: int = 1,
    prompt_type: Optional[str] = None,
    temperature: float = 0.0,
    max_decoding_steps: int = 2048,
    sc_round: int = 1,
    mcq_difficulty_level: Optional[list[str]] = None,
) -> list[dict[str, Any]]:
  """Runs `eval_func` on each selected MCQ and returns per-question results.

  Args:
    llm_address: Address or identifier of the model, passed to
      `_get_lm_model`.
    dataset_path: Path to the dataset of MCQ example questions.
    domain: The domain of the MCQ dataset; must be exactly 'Sleep' or
      'Fitness'.
    eval_func: Function used to evaluate the model on one question. It is
      called with keyword arguments `model`, `domain`, `features`,
      `prompt_type`, `temperature`, `max_decoding_steps` and `sc_round`, and
      must return a dict whose keys do not clash with the dataset columns.
    num_examples: Number of examples to evaluate. If <0, evaluates all examples.
    num_replicas: Number of model replicas available; forwarded to
      `_get_lm_model` as the number of connections. Examples are evaluated
      sequentially in this function.
    prompt_type: The type of prompt to use (e.g., 'cot' or 'step_back').
    temperature: The temperature to use for the llm model.
    max_decoding_steps: The maximum number of decoding steps to run.
    sc_round: The number of self-consistency rounds.
    mcq_difficulty_level: Difficulty levels to keep. Only applied to the
      'Sleep' domain; 'Fitness' questions are never filtered by difficulty.

  Returns:
    One dict per evaluated example, holding the dataset columns for that
    example merged with the output of `eval_func`.

  Raises:
    ValueError: If `domain` is neither 'Sleep' nor 'Fitness'.
  """
  start_time = time.time()
  # Validate before any (potentially expensive) model setup.
  if domain not in ('Sleep', 'Fitness'):
    raise ValueError(f'Unknown domain: {domain}')

  model = _get_lm_model(llm_address, num_conn=num_replicas)

  # Pass a real None (not the string 'None') when no difficulty filter
  # applies; a string would be treated as an active filter by the reader.
  difficulty_filter = mcq_difficulty_level if domain == 'Sleep' else None
  feature_frame = read_mcq_dataset(
      dataset_path, domain=domain, difficulty_level=difficulty_filter
  )

  if num_examples < 0:
    num_examples = len(feature_frame)
  examples_to_evaluate = feature_frame.iloc[:num_examples]

  def _run_one_example(feats: dict[str, Any]) -> dict[str, Any]:
    """Evaluates one example and merges its features into the result."""
    inputs = feats.copy()
    res = eval_func(
        model=model,
        domain=domain,
        features=inputs,
        prompt_type=prompt_type,
        temperature=temperature,
        max_decoding_steps=max_decoding_steps,
        sc_round=sc_round,
    )
    # Merging must not overwrite any evaluation output with a dataset column.
    assert set(res.keys()).isdisjoint(set(inputs.keys())), (
        'eval_func result keys clash with dataset columns.'
    )
    res.update(inputs)
    return res

  retval = [
      _run_one_example(row.to_dict())
      for _, row in examples_to_evaluate.iterrows()
  ]
  print(
      f'Evaluated {dataset_path} with {llm_address} in'
      f' {time.time() - start_time} seconds using {num_replicas} workers.',
      flush=True,
  )
  return retval


################################################################################
# Methods for evaluating MCQs.
################################################################################

# Potential outcomes from evaluating the model on the question.
# Result-dict keys for the possible outcomes of evaluating one question. The
# matching key is set to 1 in that question's result dict.
_CORRECT = 'correct'
_INCORRECT = 'incorrect'
_SKIPPED = 'skipped'
# Placeholder stored under _MODEL_GEN when self-consistency voting produced no
# winner and lm.Score was used instead. Note this is a one-element set.
_NO_MAJORITY_VOTE = {'NO MAJORITY VOTE, USED lm.Score INSTEAD'}

# Result-dict key: the answer the model provided (if not _SKIPPED).
_MODEL_ANSWER = 'model_answer'

# Result-dict key, relevant only for lm.Score -- the raw logprobs of each
# choice.
_MODEL_SCORES = 'model_scores'

# Result-dict key, relevant only for lm.Generate -- the generated text.
_MODEL_GEN = 'model_generations'


def eval_score(
    *,
    model: Any,
    domain: str,
    features: dict[str, Any],
    prompt_type: Optional[str] = None,
    temperature: float = 0.0,
    max_decoding_steps: int = 5,
    sc_round=None,
) -> dict[str, Any]:
  """Returns correct/incorrect for the question when evaluated with lm.Score.

  The model scores each option label as a completion of the instruction-
  prefixed question; the highest-scoring label is taken as the answer.

  Args:
    model: Object with a `Score(prompt, completion)` method returning a
      number (higher means more likely).
    domain: MCQ domain, forwarded to `add_instruction_to_prompt`.
    features: The MCQ sample; read at the 'question', 'choices' (mapping keyed
      by option label) and 'answer' keys.
    prompt_type: Unused; accepted for a uniform eval-function signature.
    temperature: Unused; accepted for a uniform eval-function signature.
    max_decoding_steps: Unused; accepted for a uniform eval-function
      signature.
    sc_round: Unused; accepted for a uniform eval-function signature.

  Returns:
    A dict with the chosen answer, the per-option scores (in option order),
    and either the 'correct' or the 'incorrect' key set to 1.
  """
  del sc_round  # unused.
  del prompt_type  # unused.
  del temperature  # unused.
  del max_decoding_steps  # unused.
  full_question = add_instruction_to_prompt([features], domain=domain)[0][
      'question'
  ]

  # Run lm.Score once per option. Score returns a single number, so the
  # results are collected one by one (list.extend would need an iterable).
  option_labels = list(features['choices'])
  scores = [model.Score(full_question, label) for label in option_labels]
  model_answer = option_labels[int(np.argmax(scores))]
  return {
      _MODEL_ANSWER: model_answer,
      _MODEL_SCORES: scores,
      _CORRECT if model_answer == features['answer'] else _INCORRECT: 1,
  }


def _create_mcq_generate_prompt(
    mcq_question: str,
    mcq_options: dict[str, str],
    prompt_type: str,
    domain: str,
) -> str:
  """Converts a sleep MCQ question to a generate prompt."""
  if prompt_type == 'step_back' and domain == 'Sleep':
    return SLEEP_TAKE_STEP_BACK_MCQ.format(
        mcq_options=', '.join(sorted(mcq_options)),
        mcq_question=mcq_question.strip(),
        domain=domain,
    )
  elif prompt_type == 'cot' and domain == 'Sleep':
    return SLEEP_COT_MCQ.format(
        mcq_options=', '.join(sorted(mcq_options)),
        mcq_question=mcq_question.strip(),
        domain=domain,
    )
  elif prompt_type == 'cot' and domain == 'Fitness':
    return FITNESS_COT_MCQ.format(
        mcq_options=', '.join(sorted(mcq_options)),
        mcq_question=mcq_question.strip(),
        domain=domain,
    )
  elif prompt_type == 'step_back' and domain == 'Fitness':
    return FITNESS_TAKE_STEP_BACK_MCQ.format(
        mcq_options=', '.join(sorted(mcq_options)),
        mcq_question=mcq_question.strip(),
        domain=domain,
    )
  else:
    raise ValueError(
        f'Unsupported combination of prompt type and domain: {prompt_type} and'
        f' {domain=}.'
    )


def eval_generate(
    *,
    model: Any,
    domain: str,
    features: dict[str, Any],
    prompt_type: Optional[str] = None,
    temperature: float = 0.0,
    max_decoding_steps: int = 2048,
    sc_round=None,
) -> dict[str, Any]:
  """Returns the outcome for the question when evaluated with lm.Generate.

  Args:
    model: Object with a `Generate(prompt)` method returning the generated
      texts.
    domain: MCQ domain, used to select the prompt template.
    features: The MCQ sample; read at the 'question', 'choices' and 'answer'
      keys.
    prompt_type: Prompt template type ('cot' or 'step_back').
    temperature: Accepted for a uniform eval-function signature.
    max_decoding_steps: Accepted for a uniform eval-function signature.
    sc_round: Unused; accepted for a uniform eval-function signature.

  Returns:
    A dict holding the model answer, the set of generations, and exactly one
    outcome key set to 1: 'correct', 'incorrect', or 'skipped' when no
    generation contained a well-formed answer (the answer is then None).
  """
  del sc_round  # unused.
  full_question = _create_mcq_generate_prompt(
      features['question'],
      features['choices'],
      prompt_type,
      domain,
  )
  # NOTE(review): `temperature` and `max_decoding_steps` are not forwarded to
  # the model here -- confirm whether Generate should receive them.
  raw_generations = model.Generate(full_question)
  # Guard against a model returning one bare string: iterating it would split
  # the text into single characters.
  if isinstance(raw_generations, str):
    raw_generations = [raw_generations]
  generations = set(raw_generations)

  model_answer = _postprocess_generation_answer(generations)
  # Anything other than an option string means no valid answer was found;
  # record the question as skipped rather than counting it as incorrect.
  if not isinstance(model_answer, str):
    return {
        _MODEL_ANSWER: None,
        _SKIPPED: 1,
        _MODEL_GEN: generations,
    }
  return {
      _MODEL_ANSWER: model_answer,
      _CORRECT if model_answer == features['answer'] else _INCORRECT: 1,
      _MODEL_GEN: generations,
  }


def eval_generate_sc(
    *,
    model: Any,
    domain: str,
    features: dict[str, str],
    prompt_type: Optional[str] = None,
    temperature: float = 0.0,
    max_decoding_steps: int = 2048,
    sc_round: int = 5,
) -> dict[str, Any]:
  """Evaluates one question with lm.Generate plus self-consistency voting.

  `eval_generate` is run `sc_round` times and the results are passed to
  `find_majority_vote_answer`. If that yields a winner, it is compared with
  the reference answer; otherwise the question is re-evaluated with
  `eval_score`.

  Args:
    model: The language model object handed to the underlying evaluators.
    domain: MCQ domain.
    features: The MCQ sample; read here at the 'answer' key.
    prompt_type: Prompt template type used for the generation rounds.
    temperature: Forwarded to the underlying evaluators.
    max_decoding_steps: Forwarded to the underlying evaluators.
    sc_round: Number of generation rounds to sample.

  Returns:
    A dict with the final answer, the 'correct' or 'incorrect' key set to 1,
    and the generations of the winning sample (or `_NO_MAJORITY_VOTE` when
    the scoring fallback was used).
  """
  sampled_results = []
  for _ in range(sc_round):
    sampled_results.append(
        eval_generate(
            model=model,
            domain=domain,
            features=features,
            prompt_type=prompt_type,
            temperature=temperature,
            max_decoding_steps=max_decoding_steps,
            sc_round=None,
        )
    )

  vote_winner = find_majority_vote_answer(sampled_results)
  if vote_winner:
    voted_answer, voted_generations = vote_winner
    outcome_key = _CORRECT if voted_answer == features['answer'] else _INCORRECT
    return {
        _MODEL_ANSWER: voted_answer,
        outcome_key: 1,
        _MODEL_GEN: voted_generations,
    }

  # No clear winner: fall back to scoring the options directly, and replace
  # the raw scores with a marker explaining where the answer came from.
  fallback = eval_score(
      model=model,
      domain=domain,
      features=features,
      prompt_type=None,
      temperature=temperature,
      max_decoding_steps=max_decoding_steps,
      sc_round=sc_round,
  )
  del fallback[_MODEL_SCORES]
  fallback[_MODEL_GEN] = _NO_MAJORITY_VOTE
  return fallback

## Evaluation

In [None]:
def _accuracy(results: list[dict[str, Any]]) -> tuple[int, int, float]:
  """Returns (correct, incorrect, accuracy) tuple."""
  correct = sum(q.get(_CORRECT, 0) for q in results)
  incorrect = sum(q.get(_INCORRECT, 0) for q in results)
  acc = np.nan if correct + incorrect == 0 else correct / (correct + incorrect)
  return correct, incorrect, acc


def analyze_results(results: list[dict[str, Any]]) -> None:
  """Prints accuracy for all questions and for each difficulty level.

  Args:
    results: Per-question result dicts. Either every dict or none of them
      must have a 'difficulty' key. Questions whose difficulty is missing
      (None/NaN, e.g. an empty CSV cell) count towards 'All' only.

  Raises:
    ValueError: If only some of the questions carry a 'difficulty' key.
  """
  stratifications = {'All': results}
  num_questions_with_difficulty = sum(int('difficulty' in q) for q in results)
  if num_questions_with_difficulty not in [0, len(results)]:
    raise ValueError(
        'Expected either all or none of the questions to be annotated with '
        f'difficulty, found {num_questions_with_difficulty}/{len(results)}.'
    )
  if num_questions_with_difficulty:
    # Group in a single pass instead of rescanning the results per level.
    by_difficulty = collections.defaultdict(list)
    for q in results:
      level = q['difficulty']
      # NaN never equals itself and cannot be ordered against strings, so a
      # missing difficulty gets no stratum of its own.
      if pd.isna(level):
        continue
      by_difficulty[level].append(q)
    stratifications.update(by_difficulty)

  # Sort on the label text so that non-string levels cannot break ordering.
  for diff, strat in sorted(
      stratifications.items(), key=lambda item: str(item[0])
  ):
    correct, incorrect, acc = _accuracy(strat)
    print(
        f'Accuracy for {diff} questions: {correct}/{correct + incorrect} ='
        f' {acc:.2f}'
    )


def save_results(results: list[dict[str, Any]], filename: str) -> None:
  """Saves results to a CSV file.

  Args:
    results: Per-question result dicts; each distinct key becomes a column.
    filename: Destination path. An existing file is overwritten.
  """
  df_results = pd.DataFrame(results)
  # Hand pandas the path rather than a text handle opened with the platform
  # defaults: pandas then manages newlines and encoding itself, which avoids
  # doubled line endings on Windows and locale-dependent encodings.
  df_results.to_csv(filename, index=True)


def perform_full_evaluation(
    *,
    llm_address: str,
    dataset_path: str,
    domain: str,
    outroot: str | None = None,
    num_examples: int = -1,
    num_replicas: int = 1,
    prompt_type: Optional[str] = None,
    use_eval_generate: bool = False,
    temperature: float = 0,
    max_decoding_steps: int = 2048,
    mcq_difficulty_level: Optional[list[str]] = None,
    sc_round: Optional[int] = None,
) -> None:
  """Performs full evaluation."""
  if outroot:
    outroot += f'.{llm_address.split("/")[-1]}'

  if not use_eval_generate:
    print('Using model.Score for evaluation.')
    score_test_results = evaluate_model(
        llm_address=llm_address,
        dataset_path=dataset_path,
        domain=domain,
        eval_func=eval_score,
        num_examples=num_examples,
        num_replicas=num_replicas,
        mcq_difficulty_level=mcq_difficulty_level,
        temperature=temperature,
    )
    print('## Results for lm.Score evaluation. ##')
    print('\n# Test data:')
    analyze_results(score_test_results)
    if outroot:
      score_test_save_path = outroot.format(split='test') + '.score.csv'
      save_results(score_test_results, score_test_save_path)
      return score_test_results
  elif use_eval_generate and sc_round:
    print('Using model.Generate with self-consistency for evaluation.')
    generate_test_results = evaluate_model(
        llm_address=llm_address,
        dataset_path=dataset_path,
        domain=domain,
        eval_func=eval_generate_sc,
        num_examples=num_examples,
        num_replicas=num_replicas,
        prompt_type=prompt_type,
        temperature=temperature,
        max_decoding_steps=max_decoding_steps,
        sc_round=sc_round,
        mcq_difficulty_level=mcq_difficulty_level,
    )
    print(f'## Results for self-consistency {prompt_type} evaluation. ##')
    print('\n# Test data:')
    analyze_results(generate_test_results)
    if outroot:
      generate_test_save_path = (
          outroot.format(split='test') + f'.{prompt_type}.sc.csv'
      )
      save_results(generate_test_results, generate_test_save_path)
    return generate_test_results
  else:
    print('Using model.Generate for evaluation.')
    generate_test_results = evaluate_model(
        llm_address=llm_address,
        dataset_path=dataset_path,
        domain=domain,
        eval_func=eval_generate,
        num_examples=num_examples,
        num_replicas=num_replicas,
        prompt_type=prompt_type,
        temperature=temperature,
        max_decoding_steps=max_decoding_steps,
        mcq_difficulty_level=mcq_difficulty_level,
    )
    print(f'## Results for lm.Generate {prompt_type} evaluation. ##')
    print('\n# Test data:')
    analyze_results(generate_test_results)
    if outroot:
      generate_test_save_path = (
          outroot.format(split='test') + f'.{prompt_type}.csv'
      )
      save_results(generate_test_results, generate_test_save_path)
    return generate_test_results

In [None]:
# @title Generate Synthetic Dummy MCQs

import json

def _make_sleep_mcq(question, answer, options, difficulty):
  """Builds one synthetic Sleep MCQ record in the dataset's column layout.

  Args:
    question: Question text.
    answer: Label of the correct option, e.g. '(C)'.
    options: Option texts in order; they are labelled '(A)', '(B)', ...
    difficulty: Difficulty annotation for the question.

  Returns:
    A dict with 'question', 'answer', 'choices', 'domain' and 'difficulty'
    keys, where 'choices' is an OrderedDict mapping label to option text.
  """
  labels = [f'({letter})' for letter in 'ABCDEF']
  return {
      'question': question,
      'answer': answer,
      'choices': collections.OrderedDict(zip(labels, options)),
      'domain': 'Sleep',
      'difficulty': difficulty,
  }


data = [
    _make_sleep_mcq(
        question='What is the recommended amount of sleep for adults?',
        answer='(C)',
        options=['4-5 hours', '6-7 hours', '7-9 hours', '10-12 hours'],
        difficulty='Easy',
    ),
    _make_sleep_mcq(
        question=(
            'Which stage of sleep is characterized by rapid eye movements and'
            ' vivid dreams?'
        ),
        answer='(D)',
        options=['Stage 1', 'Stage 2', 'NREM', 'REM'],
        difficulty='Moderate',
    ),
    _make_sleep_mcq(
        question=(
            'A patient presents with excessive daytime sleepiness and loud'
            ' snoring. What is a likely diagnosis?'
        ),
        answer='(B)',
        options=[
            'Insomnia',
            'Sleep Apnea',
            'Narcolepsy',
            'Restless Legs Syndrome',
        ],
        difficulty='Hard',
    ),
    _make_sleep_mcq(
        question=(
            'What hormone is primarily responsible for regulating the'
            ' sleep-wake cycle?'
        ),
        answer='(A)',
        options=['Melatonin', 'Cortisol', 'Insulin', 'Adrenaline'],
        difficulty='Easy',
    ),
    _make_sleep_mcq(
        question='Which of the following is a common symptom of insomnia?',
        answer='(C)',
        options=[
            'Loud snoring',
            'Daytime alertness',
            'Difficulty falling or staying asleep',
            'Sudden sleep attacks',
        ],
        difficulty='Moderate',
    ),
    _make_sleep_mcq(
        question=(
            'Cognitive Behavioral Therapy for Insomnia (CBT-I) typically'
            ' includes which of the following components?'
        ),
        answer='(E)',
        options=[
            'Only medication prescription',
            'Strict diet restrictions',
            'Exposure therapy to fears',
            'Biofeedback alone',
            'Sleep restriction and stimulus control',
        ],
        difficulty='Hard',
    ),
]


df = pd.DataFrame(data)

# Serialize each question's `choices` mapping as a JSON string of a plain
# dict before writing, so the CSV cell holds a standard, parseable literal
# (the reading side may parse it with ast.literal_eval rather than json).
df['choices'] = [json.dumps(dict(options)) for options in df['choices']]

# Write the synthetic dataset next to the notebook.
df.to_csv('./synthetic_mcq_data.csv', index=False)

print(
    "Dummy 'synthetic_mcq_data.csv' has been created successfully with"
    ' corrected choices format.'
)

In [None]:
# Run configuration consumed by the evaluation cell(s) below.
# Model addresses to evaluate; the evaluation loop runs once per entry.
llm_addresses = ['dummy_llm_address']
# Passed as `num_replicas` to `perform_full_evaluation`.
g_num_model_replicas = 5  # @param {type:"integer"}
# Question domain, passed as `domain` to `perform_full_evaluation`.
g_domain = 'Sleep'  # @param ['Sleep', 'Fitness']
# Temperature forwarded to the evaluation call.
temperature = 0.7  # @param {type:"number"}
# Decoding budget forwarded to the generation-based evaluation modes.
max_decoding_steps = 2048  # @param {type:"integer"}
# NOTE(review): not referenced in the visible part of the notebook; presumably
# the decoding budget for the scoring mode — confirm or remove.
max_decoding_steps_score = 5
# Number of self-consistency rounds (the `sc_round` argument of
# `perform_full_evaluation`; a falsy value disables self-consistency there).
sc_round = 3  # @param {type:"integer"}
# CSV written by the synthetic-data cell above.
dataset_path = './synthetic_mcq_data.csv'  # @param
# Output path prefix; when set, `perform_full_evaluation` saves result CSVs
# whose names are derived from it.
outroot = '/tmp/'  # @param

## CoT + Self Consistency - LM.Generate / Score

In [None]:
# Run CoT generation with self-consistency for every configured model. The
# number of rounds comes from the `sc_round` form parameter instead of being
# hard-coded to None, which silently ignored that parameter; set `sc_round` to
# 0 (or None) to fall back to plain CoT generation.
for llm_address in llm_addresses:
  _ = perform_full_evaluation(
      llm_address=llm_address,
      dataset_path=dataset_path,
      domain=g_domain,  # or 'Fitness'
      outroot=outroot,
      # num_examples=3, # Only used for debugging.
      num_replicas=g_num_model_replicas,
      prompt_type='cot',
      use_eval_generate=True,
      temperature=temperature,
      max_decoding_steps=max_decoding_steps,
      mcq_difficulty_level=[
          SLEEP_MCQ_DIFF_LEVEL_EASY,
          SLEEP_MCQ_DIFF_LEVEL_MODERATE,
          SLEEP_MCQ_DIFF_LEVEL_HARD,
      ],
      sc_round=sc_round,
  )