# LLM Model Ranking via Self-Generated Challenge Questions

This notebook orchestrates a simple model-ranking workflow:
- Ask one model to generate a challenging, nuanced question.
- Pose that question to multiple models and collect their answers.
- Compare answers to produce a rough ranking.

Notes:
- Keep prompts deterministic when possible to reduce variance.
- Be mindful of API usage and costs.
- Do not print full API keys; only partial, masked previews.

In [None]:
# Imports
# - os/json: environment and simple serialization
# - dotenv: load API keys from .env
# - openai: client for OpenAI-compatible APIs
# - display utils: richer notebook output
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display


In [None]:
# Load environment variables from a local .env file.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [None]:
# Read both API keys from the environment and report whether each one is present.
# Only a short prefix of a key is ever printed, never the full value.
openai_api_key = os.environ.get("OPENAI_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")

print(
    f"OpenAI API key found and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API key not found"
)
print(
    f"Google API key found and begins {google_api_key[:2]}"
    if google_api_key
    else "Google API key not found"
)


OpenAI API key found and begins sk-proj-
Google API key found and begins AI


In [None]:
# Build the single-turn conversation that asks a model to invent the evaluation question
request = " ".join(
    [
        "Please come up with a challenging, nuanced question that I can ask a number of LLMs",
        "to evaluate their intelligence. Answer only with the question, no explanation.",
    ]
)
messages = [dict(role="user", content=request)]


In [None]:
# Inspect the message payload to be sent to the model
# (a bare trailing expression is rendered as the cell's output)
messages

[{'role': 'user',
  'content': 'Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. Answer only with the question, no explanation.'}]

In [None]:
# Call the model to generate the evaluation question.
# OpenAI must be instantiated: the bare class is not a usable client, so
# `openai.chat.completions.create(...)` fails without the parentheses.
# With no arguments the client reads OPENAI_API_KEY from the environment
# (loaded from .env earlier). Adjust the model name as needed.
openai = OpenAI()
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
)
# Keep only the text of the first (and only requested) choice
question = response.choices[0].message.content
print(question)


In [None]:
# Start with empty result lists. Each queried model appends its name to `competitors`
# and its reply text to `answers`, so the two lists stay index-aligned.
competitors, answers = [], []  # e.g. competitors -> ["gpt-4o-mini", "gemini-2.0-flash"]

# Replace the conversation with a single user turn carrying the generated question
messages = [dict(role="user", content=question)]


In [None]:
# First competitor: gpt-4o-mini, through the existing `openai` client.
# Send the shared `messages`, render the reply as Markdown, then record the
# model name and its answer at matching positions in the two result lists.

model_name = "gpt-4o-mini"

response = openai.chat.completions.create(
    messages=messages,
    model=model_name,
)
answer = response.choices[0].message.content

display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)


In [None]:
# Second competitor: gemini-2.0-flash, reached through Google's OpenAI-compatible endpoint.
# Send the same `messages`, render the reply as Markdown, then record the
# model name and its answer at matching positions in the two result lists.

model_name = "gemini-2.0-flash"
gemini = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=google_api_key,
)

response = gemini.chat.completions.create(messages=messages, model=model_name)
answer = response.choices[0].message.content

display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)



In [None]:
# Quick sanity check: dump the raw competitor and answer lists, one per line
print(competitors, answers, sep="\n")

In [None]:
# Walk the two index-aligned lists in lockstep and print each model with its answer
for competitor, answer in zip(competitors, answers):
    print("Competitor: " + str(competitor), str(answer), sep="\n\n")

In [None]:
# Combine every answer into one text block for the judge.
# Each answer is labelled by its 1-based position, so the judge sees competitor
# numbers rather than model names.

sections = []
for index, answer in enumerate(answers):
    header = f"# Response from competitor {index+1}\n\n"
    sections.append(header + answer + "\n\n")
together = "".join(sections)

print(together)


In [None]:
# Build the judging prompt. It interpolates the number of competitors, the
# generated question, and the combined responses, and asks for a JSON-only ranking.
# The doubled braces {{ }} produce literal braces inside the f-string.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""


In [None]:
# Review the fully rendered judging prompt before sending it
print(judge)

In [None]:
# Wrap the judging prompt as a single user turn
judge_messages = [dict(role="user", content=judge)]

In [None]:
# Judgement time! Send the judging prompt to gpt-4o-mini and keep its raw reply text.

openai = OpenAI()
response = openai.chat.completions.create(messages=judge_messages, model="gpt-4o-mini")
results = response.choices[0].message.content
print(results)


In [None]:
# OK let's turn this into results!
# Parse the judge's JSON reply and print the competitors in ranked order.

# Models sometimes wrap JSON in a markdown code fence despite the prompt;
# peel off a surrounding ```...``` (and an optional "json" tag) before parsing.
cleaned = results.strip()
if cleaned.startswith("```"):
    cleaned = cleaned.strip("`").strip()
    if cleaned.lower().startswith("json"):
        cleaned = cleaned[4:]

result_dict = json.loads(cleaned)
ranks = result_dict["results"]
for index, result in enumerate(ranks):
    # The judge reports 1-based competitor numbers. Validate the range explicitly:
    # without this check, 0 or a negative number would silently select the wrong
    # competitor through Python's negative indexing.
    position = int(result)
    if not 1 <= position <= len(competitors):
        raise ValueError(
            f"Judge returned competitor number {position}; expected 1 to {len(competitors)}"
        )
    competitor = competitors[position - 1]
    print(f"Rank {index+1}: {competitor}")