In [1]:
import json
import os
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# Load settings from a local .env file before anything reads the environment.
# NOTE(review): per python-dotenv, override=True should let .env values replace
# variables that are already set in the process — confirm this is intended.
load_dotenv(override=True)

True

In [3]:
# Model that generates the evaluation question.
model = "gpt-5-nano"

# API key as found in the environment (None when the variable is unset).
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [5]:
# Client used for every API request in this notebook.
# No key is passed explicitly; presumably the SDK picks up OPENAI_API_KEY from
# the environment — confirm against the openai SDK documentation.
openai = OpenAI()

In [4]:
# Prompt asking a model to invent the question every competitor will answer.
# The first literal ends with a space so the two sentences do not run together
# ("...intelligence.Respond only...") once concatenated.
request = (
    "Come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Respond only with the question, no explanation."
)

In [12]:
# Send the question-generation prompt as a single user turn.
messages = [dict(role="user", content=request)]
response = openai.chat.completions.create(messages=messages, model=model)

In [13]:
# The text of the first returned choice is the question put to every competitor.
first_choice = response.choices[0]
question = first_choice.message.content

In [14]:
# Show the generated question as the cell output.
question

'Design a policy framework to allocate scarce global health resources (e.g., vaccines, antivirals, ICU capacity) during a novel pandemic under deep uncertainty and diverse, uneven data quality. Your answer should include: (1) a clear objective, constraints, and trade-offs you are willing to accept; (2) a scalable governance and decision-rule design (data inputs, scoring/allocation rules, accountability); (3) an uncertainty-management plan using scenario generation, robustness criteria, and clear triggers for adaptation; (4) an equity analysis across regions, socio-economic groups, and vulnerability factors, including potential harms and mitigation strategies; (5) an operational implementation outline (data infrastructure, governance bodies, and timelines); (6) a comprehensive evaluation protocol with metrics and a validation approach (e.g., simulation, back-testing) and a plan for monitoring and iteration; and (7) a concise justification of your key assumptions, expected outcomes, and 

In [15]:
# Parallel lists filled by the competitor cells: competitors[i] wrote answers[i].
competitors, answers = [], []

# Every competitor receives the same single user turn containing the question.
messages = [dict(role="user", content=question)]

In [16]:
# Ask one competitor the shared question and record its answer.
model_name = "gpt-5-nano"
response = openai.chat.completions.create(model=model_name, messages=messages)
answer = response.choices[0].message.content

# Keep the cell idempotent: re-running it must not register the same competitor
# twice, which would inflate the competitor count and duplicate a response in
# the judge prompt. An existing entry is replaced with the fresh answer.
if model_name in competitors:
    answers[competitors.index(model_name)] = answer
else:
    competitors.append(model_name)
    answers.append(answer)

In [18]:
# Show the answer just collected from this competitor.
answer

'Policy framework for allocating scarce global health resources during a novel pandemic under deep uncertainty\n\n1) Objective, constraints, and trade-offs\n\n- Objective\n  - Primary: minimize global mortality and severe disease from the pandemic, subject to preserving essential health system functioning (including non-pandemic care) and enabling rapid, scalable response.\n  - Secondary: reduce inequities across regions, socio-economic groups, and vulnerability strata; maintain transparent, legitimate, and credible decision processes; enable adaptive roll-out that can respond to new information without catastrophic delay.\n\n- Constraints\n  - Scarcity: limited supplies of vaccines, antivirals, ICU capacity, oxygen, PPE, and critical care staff relative to demand.\n  - Data quality and timeliness: heterogeneous, incomplete, delayed, or unreliable data across regions; varying surveillance capacities.\n  - Logistics and manufacturing: production lead times, cold-chain needs, distributio

In [19]:
# Ask one competitor the shared question and record its answer.
model_name = "gpt-4o-mini"
response = openai.chat.completions.create(model=model_name, messages=messages)
answer = response.choices[0].message.content

# Keep the cell idempotent: re-running it must not register the same competitor
# twice, which would inflate the competitor count and duplicate a response in
# the judge prompt. An existing entry is replaced with the fresh answer.
if model_name in competitors:
    answers[competitors.index(model_name)] = answer
else:
    competitors.append(model_name)
    answers.append(answer)

In [20]:
# Show the answer just collected from this competitor.
answer

"### Policy Framework for Allocating Scarce Global Health Resources During a Novel Pandemic\n\n#### 1. Objective, Constraints, and Trade-offs\n\n**Objective:**\nTo allocate scarce global health resources (vaccines, antivirals, ICU capacity) effectively and equitably during a pandemic, maximizing public health outcomes and minimizing mortality and morbidity.\n\n**Constraints:**\n- Resource scarcity: Limited availability of vaccines, therapeutics, and ICU capacity.\n- Data quality variability: Uneven reliability of data from different regions and sources.\n- Time sensitivity: Rapid decision-making needed as the pandemic evolves.\n- Legal and ethical norms: Compliance with ethical guidelines and international laws.\n\n**Acceptable Trade-offs:**\n- Prioritize high-risk populations (e.g., healthcare workers, elderly, individuals with comorbidities) over low-risk groups, even if this means short-term inequalities.\n- Allow for higher allocations in regions with faster disease spread at the e

In [21]:
# Ask one competitor the shared question and record its answer.
model_name = "gpt-5-mini"
response = openai.chat.completions.create(model=model_name, messages=messages)
answer = response.choices[0].message.content

# Keep the cell idempotent: re-running it must not register the same competitor
# twice, which would inflate the competitor count and duplicate a response in
# the judge prompt. An existing entry is replaced with the fresh answer.
if model_name in competitors:
    answers[competitors.index(model_name)] = answer
else:
    competitors.append(model_name)
    answers.append(answer)

In [22]:
# Show the answer just collected from this competitor.
answer

'Below is a practical, ethically grounded, and operational policy framework to allocate scarce global health resources (vaccines, antivirals, ICU capacity) during a novel pandemic characterized by deep uncertainty and uneven data quality. It is designed to be scalable, adaptive, transparent, and equity-sensitive.\n\n1) Objective, constraints, and trade-offs\n- Primary objective\n  - Minimize preventable severe illness and deaths globally while preserving critical health system function and limiting social disruption, subject to fairness, feasibility, and political realities.\n- Secondary objectives\n  - Protect health workers and essential responders; reduce epidemic spread when feasible; reduce catastrophic economic and social harms in most vulnerable communities.\n- Hard constraints\n  - Finite, time-varying supply of resources; variable logistic capacity (cold chain, transport, trained staff); sovereign control over national health systems; legal and regulatory diversity; data gaps 

In [23]:
# Concatenate every answer under a numbered header (numbering starts at 1)
# so the judge can refer to competitors by number.
sections = []
for idx, answer in enumerate(answers):
    header = f"#Response from competitor {idx + 1}\n\n"
    sections.append(header + answer + "\n\n")
all_answers = "".join(sections)

In [26]:
# Prompt for the judge model. It states how many competitors there are, repeats
# the question, asks for a JSON-only reply of the form {"results": [...]} ordered
# best to worst, and embeds all collected responses.
# The doubled braces {{ }} are literal braces inside the f-string.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{all_answers}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""

In [28]:
# Submit the judging prompt as a single user turn.
# NOTE(review): the judge model is also one of the competitors, which may bias
# the ranking.
judge_messages = [dict(role="user", content=judge)]
response = openai.chat.completions.create(messages=judge_messages, model="gpt-4o-mini")

In [38]:
# Parse the judge's JSON reply and keep the ordered list stored under "results".
judge_reply = response.choices[0].message.content
rankings = json.loads(judge_reply)["results"]

In [42]:
# Normalise each ranking entry to an int so it can be used for list indexing.
rankings = [int(entry) for entry in rankings]

In [46]:
# Show the judge's ordering: 1-based competitor numbers, best first.
rankings

[1, 3, 2]

In [49]:
# Print the leaderboard, best first.
# The loop variable is deliberately not called `model`: that name holds the
# question-generation model id, and overwriting it with an int would break a
# re-run of the earlier cell that passes `model` to the API.
for place, competitor_number in enumerate(rankings, start=1):
    # The judge numbers competitors from 1; the list is 0-based.
    competitor = competitors[competitor_number - 1]
    print(f"{place}. {competitor}")

1. gpt-5-nano
2. gpt-5-mini
3. gpt-4o-mini


In [50]:
# Show competitor names in the order they were queried; ranking number n maps to
# competitors[n - 1].
competitors

['gpt-5-nano', 'gpt-4o-mini', 'gpt-5-mini']