In [None]:

import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display

In [None]:
# Load variables from a .env file into the process environment. With override=True,
# values from the file take precedence over variables that are already set.
load_dotenv(override=True)

In [None]:
# Read the OpenAI credential from the environment and report whether it is present.
# Only the first 8 characters are echoed, never the full key.
openai_api_key = os.getenv('OPENAI_API_KEY')

print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

In [None]:
# Prompt asking a model to invent the single question every competitor will answer.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)
# Chat payload: one user message carrying the prompt above.
messages = [dict(role="user", content=request)]

In [None]:
# Create the OpenAI client and have gpt-5-mini write the competition question.
openai = OpenAI()
response = openai.chat.completions.create(messages=messages, model="gpt-5-mini")
# The question is the text of the first (only requested) choice.
question = response.choices[0].message.content
print(question)


In [None]:
def msg(txt):
    """Wrap ``txt`` in a single chat message dict with the "user" role."""
    return {"role": "user", "content": txt}


def ask(prompt, model='gpt-4.1-mini'):
    """Send ``prompt`` as a one-message chat completion request.

    Uses the notebook-level ``openai`` client, which must exist before this
    function is called.

    Args:
        prompt: Text placed in the single user message.
        model: Model name passed to the API.

    Returns:
        The response object returned by ``openai.chat.completions.create``.
    """
    return openai.chat.completions.create(model=model, messages=[msg(prompt)])


def answer(res):
    """Return the message content of the first choice in response ``res``."""
    return res.choices[0].message.content

In [None]:
# Parallel accumulators: competitors[i] is the model that produced answers[i].
competitors, answers = [], []
# From here on every competitor receives the generated question as its only message.
messages = [dict(role="user", content=question)]

In [None]:

model_name = "gpt-5-nano"

response = openai.chat.completions.create(model=model_name, messages=messages)
# Bind the reply to its own name rather than `answer`, so the `answer(res)` helper
# defined earlier in the notebook is not replaced by a string.
answer_text = response.choices[0].message.content

display(Markdown(answer_text))
# Record the model and its reply at the same position in the two parallel lists.
competitors.append(model_name)
answers.append(answer_text)

In [None]:
# NOTE(review): appears to be a scratch/sanity-check cell; it defines nothing that other cells use.
print('Hello!')

In [None]:
# Show the collected model names and their answers, one list per line.
print(competitors, answers, sep="\n")

In [None]:
# Build the prompt!
# Concatenate every answer under an anonymised, 1-based "competitor N" heading.
# A generator keeps the loop variables out of the notebook namespace, so the
# `answer` helper defined earlier is not overwritten here.
together = "".join(
    f"# Response from competitor {position}\n\n" + response_text + "\n\n"
    for position, response_text in enumerate(answers, start=1)
)

# Use LLM to judge who did best!!
# The judge sees only competitor numbers, never model names, and is asked for JSON only.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""

print(judge)

In [None]:
# Chat payload for the judge: a single user message containing the full judging prompt.
judge_messages = [dict(role="user", content=judge)]

In [None]:
# Judgement time!
# Send the judging prompt to gpt-5-mini and keep the raw reply text for parsing.

openai = OpenAI()
response = openai.chat.completions.create(messages=judge_messages, model="gpt-5-mini")
results = response.choices[0].message.content
print(results)

In [None]:
# OK let's turn this into results!

# Parse only the outermost {...} span so that stray text around the JSON object
# (for example a markdown code fence) does not break json.loads.
json_start, json_end = results.find("{"), results.rfind("}")
json_text = results[json_start:json_end + 1] if 0 <= json_start < json_end else results
results_dict = json.loads(json_text)
ranks = results_dict["results"]

# Validate every competitor number before printing anything. Without this check a
# "0" from the judge would become index -1 and silently select the last competitor.
competitor_numbers = [int(result) for result in ranks]
for number in competitor_numbers:
    if not 1 <= number <= len(competitors):
        raise IndexError(
            f"Judge returned competitor number {number}; expected a value from 1 to {len(competitors)}"
        )

# Competitor numbers are 1-based positions in `competitors`.
for index, number in enumerate(competitor_numbers):
    competitor = competitors[number - 1]
    print(f"Rank {index+1}: {competitor}")

In [None]:
# Takeaway: for a given task or project, you can collect answers to the same question from several models,
# then have another model determine which response was best. The judge is not told which model wrote which answer; each is labelled only as "Competitor 1", etc.
# This way you can test models to see which is best at a particular task.

In [None]:
# You could even have each competitor produce its own ranking, then average the rankings for a more robust result!

# Then bam! You have agent orchestration!