Investigating Bias in LLM Self-Evaluation
=========================================

Appendix
--------

### Code

#### Dependencies

In [1]:
# !pip install matplotlib==3.10.0
# !pip install numpy==2.2.3
# !pip install pandas==2.2.3
# !pip install requests==2.32.3

In [2]:
import collections as coll
import collections.abc as collabc
import functools
import json
import os
import os.path
import typing
import urllib.parse

import requests

#### API keys

In [3]:
# JSON file holding the API keys, looked up relative to the working directory.
api_keys_filename = "api-keys.json"

# Check up front so that a missing file produces a descriptive error.
if os.path.isfile(api_keys_filename):
    with open(api_keys_filename, "r") as f:
        api_keys = json.load(f)
else:
    raise RuntimeError(f"API keys file not found: {api_keys_filename!r}")

# Show only the names of the loaded keys, never their values.
print("API keys: " + ", ".join(sorted(api_keys.keys())))

API keys: anthropic, deepseek, google, openai, perplexity


#### Common Utilities

In [4]:
# Default cap on the number of output tokens requested from a model
# (individual clients may lower it).
MAX_OUT_TOKENS = 32768
# Default token budget for reasoning ("thinking"), used by the clients that
# send one.
MAX_REASONING_TOKENS = 8192
# Default sampling temperature (individual clients may override or omit it).
TEMPERATURE = 0.3


def query_all(
        experiment_name: str,
        system_prompt: str,
        user_prompt: str,
        temperature: float=TEMPERATURE,
        max_out_tokens: int=MAX_OUT_TOKENS,
        reasoning_tokens: int=MAX_REASONING_TOKENS,
):
    """Lazily query every enabled model with the same prompts.

    This is a generator: each model's client is called only when the
    corresponding item is requested, in the order listed below.

    Args:
        experiment_name: Passed to every client as its first argument.
        system_prompt: System prompt passed to every client.
        user_prompt: User prompt passed to every client.
        temperature: Sampling temperature passed to every client.
        max_out_tokens: Output token limit passed to every client.
        reasoning_tokens: Reasoning token budget passed to every client.

    Yields:
        ``(model_name, response)`` pairs, where ``response`` is whatever the
        model's query function returned.
    """
    # Every client takes the same positional arguments, so pack them once.
    shared_args = (
        experiment_name,
        system_prompt,
        user_prompt,
        temperature,
        max_out_tokens,
        reasoning_tokens,
    )
    clients = (
        ("sonnet", query_claude_sonnet),
        ("deepseek", query_deepseek),
        ("gemini", query_gemini),
        ("gpt4", query_gpt4),
        ("perplexity", query_perplexity),
        # ("o3mini", query_o3mini),  # currently disabled
    )

    for label, client in clients:
        yield label, client(*shared_args)


def send_request(
        cache_filename: str,
        url: str,
        request_headers: collabc.Mapping,
        request_body: collabc.Mapping,
        sensitive_headers: collabc.Container=(),
        sensitive_body_fields: collabc.Container=(),
):
    """POST a JSON request and cache the exchange on disk.

    If ``cache_filename`` already exists, its parsed JSON content is returned
    and no request is sent. Otherwise the request is sent, the request and
    the response (with sensitive items removed) are written to
    ``cache_filename`` and the same structure is returned.

    Args:
        cache_filename: Path of the JSON cache file for this request.
        url: URL to POST to.
        request_headers: HTTP headers to send.
        request_body: JSON-serializable request body.
        sensitive_headers: ``del_items`` patterns of headers that must not be
            stored in the cache (applied to request and response headers).
        sensitive_body_fields: ``del_items`` patterns of body fields that must
            not be stored in the cache (applied to request and response
            bodies).

    Returns:
        A dict of the form ``{"request": {"headers": ..., "body": ...},
        "response": {"headers": ..., "body": ...}}``. Cache files are returned
        as stored, so files written by an earlier version of this function
        may contain the misspelled key ``"headders"`` instead of
        ``"headers"`` under ``"response"``.

    Raises:
        Exception: Any exception raised while sending the request or storing
            the result is printed (together with the response status and body
            when the exception carries a response) and then re-raised.
    """
    sensitive_headers = {h.lower() for h in sensitive_headers}
    sensitive_body_fields = {f.lower() for f in sensitive_body_fields}

    cache_dir = os.path.dirname(cache_filename)

    # A bare filename has no directory part, and os.makedirs("") would raise.
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    if os.path.isfile(cache_filename):
        with open(cache_filename, "r", encoding="utf-8") as f:
            return json.load(f)

    try:
        response = requests.post(url, headers=request_headers, json=request_body)
        response.raise_for_status()

        result = {
            "request": {
                "headers": del_items(request_headers, sensitive_headers),
                "body": del_items(request_body, sensitive_body_fields),
            },
            "response": {
                "headers": del_items(response.headers, sensitive_headers),
                "body": del_items(response.json(), sensitive_body_fields),
            }
        }

        # Write to a temporary file and rename it into place, so that an
        # interrupted write cannot leave a truncated cache file behind that
        # would make every later call fail while parsing it.
        tmp_filename = cache_filename + ".tmp"

        with open(tmp_filename, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)

        os.replace(tmp_filename, cache_filename)

        return result

    except Exception as exc:
        print(f"Exception: ({type(exc)}) {exc}")

        if hasattr(exc, "response") and exc.response is not None:
            print(f"Response status code: {exc.response.status_code}")
            print(f"Response body: {exc.response.text}")

        raise


def build_cache_filename(experiment_name: str, model_name: str, temperature: float):
    """Return the cache file path for an experiment/model/temperature triple.

    The file is placed in the ``cache`` directory. The temperature is
    formatted with three decimals, and every ``"."`` in the file name stem
    (including dots inside the experiment or model name) is replaced with
    ``"_"`` before the ``.json`` extension is appended.
    """
    raw_stem = f"{experiment_name}-{model_name}-t{temperature:.3f}"
    safe_stem = "_".join(raw_stem.split("."))

    return os.path.join("cache", f"{safe_stem}.json")


def get_item(container, path: str, default=None):
    """Look up a nested item by a dot-separated path.

    Mappings are indexed by the path component itself, sequences by the
    component converted to an integer (in-range negative indices are
    accepted). Strings and bytes are treated as leaf values, not as
    sequences of characters.

    Args:
        container: Nested structure of mappings and sequences.
        path: Dot-separated path, e.g. ``"aaa.0.bbb"``. ``""`` and ``"."``
            refer to ``container`` itself.
        default: Value to return when the path cannot be resolved.

    Returns:
        The item found at ``path``, or ``default`` if a key is missing, an
        index is out of range or not an integer, or a leaf value is reached
        before the path ends.
    """
    if path == "." or path == "":
        return container

    for key in path.split("."):
        if isinstance(container, collabc.Mapping):
            if key not in container:
                return default

            container = container[key]
        elif (
                isinstance(container, collabc.Sequence)
                and not isinstance(container, (str, bytes, bytearray))
        ):
            try:
                index = int(key)
            except ValueError:
                # A non-numeric component cannot address a sequence element.
                return default

            if not -len(container) <= index < len(container):
                return default

            container = container[index]
        else:
            return default

    return container


def del_items(container, patterns: typing.List[str]):
    """Return a copy of a nested structure without the items matching any pattern.

    Mappings are copied into dicts and non-string sequences into lists,
    recursively; all other values (including strings) are reused as they are.
    The original ``container`` is not modified.

    Args:
        container: Nested structure of mappings and sequences.
        patterns: Dot-separated paths of the items to leave out, e.g.
            ``"aaa.*.ccc"``. A ``"*"`` component matches any key or index at
            that level; other components are compared case-insensitively.
            A pattern only matches paths of exactly the same length.

    Returns:
        The filtered copy.

    Raises:
        ValueError: If a pattern is ``""`` or ``"."``, which would refer to
            the whole container.
    """
    def should_include(path: list, exclude_patterns: typing.List[tuple]) -> bool:
        return not any(path_matches_pattern(path, ptrn) for ptrn in exclude_patterns)

    def copy_recursive(obj, path: list, exclude_patterns: typing.List[tuple]):
        # Strings are sequences too, but they must be kept as leaf values.
        if isinstance(obj, str):
            return obj

        if isinstance(obj, collabc.Mapping):
            copy = {}

            for k, v in obj.items():
                path_ext = path + [k]

                if should_include(path_ext, exclude_patterns):
                    copy[k] = copy_recursive(v, path_ext, exclude_patterns)

            return copy

        if isinstance(obj, collabc.Sequence):
            copy = []

            for k, v in enumerate(obj):
                # Indices are matched as strings, like the pattern components.
                path_ext = path + [str(k)]

                if should_include(path_ext, exclude_patterns):
                    copy.append(copy_recursive(v, path_ext, exclude_patterns))

            return copy

        return obj

    for pattern in patterns:
        if pattern == "." or pattern == "":
            # This must be raised: returning the exception object would hand
            # it to the caller as if it were the filtered container.
            raise ValueError(f"Invalid pattern; {pattern=!r}")

    patterns = [tuple(pattern.lower().split(".")) for pattern in patterns]

    return copy_recursive(container, [], patterns)


def path_matches_pattern(path: collabc.Sequence, pattern: collabc.Sequence) -> bool:
    """Tell whether a path matches a pattern component by component.

    Args:
        path: Components of the path (mapping keys or stringified indices).
        pattern: Components of the pattern. ``"*"`` matches any path
            component; any other component must equal the lowercased path
            component, so it is expected to be lowercase already.

    Returns:
        ``True`` if ``path`` and ``pattern`` have the same length and every
        component matches, ``False`` otherwise.
    """
    if len(path) != len(pattern):
        return False

    # str() keeps non-string mapping keys (e.g. integers) from raising
    # AttributeError on .lower(); they are compared by their string form.
    return all(
        pattern_component == "*"
        or pattern_component == str(path_component).lower()
        for path_component, pattern_component in zip(path, pattern)
    )


def test_get_item():
    """Self-test for get_item: one resolvable path and one unresolvable path."""
    nested = {"aaa": [{"bbb": "42", "ccc": "123"}]}
    expectations = (
        ("aaa.0.bbb", "42"),
        ("aaa.2.zzz", None),  # index 2 does not exist in the list
    )

    for path, expected in expectations:
        assert_eq(expected, get_item(nested, path))


def test_del_item():
    """Self-test for del_items: literal and wildcard patterns remove fields."""
    source = {"aaa": [{"bbb": "42", "ccc": "123", "ddd": "hello"}]}
    # "zzz" does not occur in the source, so it must simply be ignored.
    patterns = ["aaa.*.ccc", "*.*.bbb", "zzz"]
    expected = {"aaa": [{"ddd": "hello"}]}

    assert_eq(expected, del_items(source, patterns))


def assert_eq(a, b):
    """Raise AssertionError with a descriptive message unless ``a == b``.

    The exception is raised explicitly rather than with an ``assert``
    statement, so the check also runs when Python is started with ``-O``.
    """
    if not (a == b):
        raise AssertionError(f"Failed to assert that a = b; {a=!r}, {b=!r}")


# Run the self-tests as soon as this cell is executed, so that a broken helper
# is noticed before any model is queried.
test_get_item()
test_del_item()

#### Anthropic Claude Client

In [5]:
def query_claude_sonnet(
        experiment_name: str,
        system_prompt: str,
        user_prompt: str,
        temperature: float=TEMPERATURE,
        max_out_tokens: int=MAX_OUT_TOKENS,
        reasoning_tokens: int=MAX_REASONING_TOKENS,
):
    """Query Claude 3.7 Sonnet with thinking enabled and return its text answer.

    Args:
        experiment_name: Used for building the cache file name.
        system_prompt: System prompt for the model.
        user_prompt: Content of the single user message.
        temperature: Ignored; it is overridden with 1 because thinking is
            enabled. The override is also what appears in the cache file name.
        max_out_tokens: Value of the ``max_tokens`` request field.
        reasoning_tokens: Value of the thinking ``budget_tokens`` field.

    Returns:
        The text of the first response content block of type ``"text"``, or
        ``None`` if the response has no such block.
    """
    # https://docs.anthropic.com/en/api/messages

    model_name = "claude-3-7-sonnet-20250219"
    temperature = 1  # Thinking requires temperature to be 1.
    cache_filename = build_cache_filename(experiment_name, model_name, temperature)
    request_headers = {
        "x-api-key": api_keys["anthropic"],
        "anthropic-version": "2023-06-01",
        "content-type": "application/json"
    }
    request_body = {
        "model": model_name,
        "max_tokens": max_out_tokens,
        "temperature": temperature,
        "stream": False,
        "system": system_prompt,
        "thinking": {
            "type": "enabled",
            "budget_tokens": reasoning_tokens,
        },
        "messages": [
            {"role": "user", "content": user_prompt}
        ]
    }
    result = send_request(
        cache_filename,
        "https://api.anthropic.com/v1/messages",
        request_headers,
        request_body,
        sensitive_headers=["x-api-key", "anthropic-organization-id", "request-id", "CF-RAY"],
        sensitive_body_fields=["id"],
    )

    # Default to an empty list so that a response without a content list
    # yields None instead of a TypeError.
    for content in get_item(result, "response.body.content", []):
        if get_item(content, "type") == "text":
            return content["text"]

    return None

#### DeepSeek Client

In [6]:
def query_deepseek(
        experiment_name: str,
        system_prompt: str,
        user_prompt: str,
        temperature: float=TEMPERATURE,
        max_out_tokens: int=MAX_OUT_TOKENS,
        reasoning_tokens: int=MAX_REASONING_TOKENS,
):
    """Query the ``deepseek-chat`` model and return its answer.

    Args:
        experiment_name: Used for building the cache file name.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        temperature: Sampling temperature.
        max_out_tokens: Output token limit; capped at 8192 before it is sent.
        reasoning_tokens: Accepted so that the signature matches the other
            clients; nothing in the request uses it.

    Returns:
        The message content of the first choice whose role is
        ``"assistant"``, or ``None`` if there is no such choice.
    """
    # https://api-docs.deepseek.com/api/create-chat-completion

    max_out_tokens = min(8192, max_out_tokens)

    model_name = "deepseek-chat"
    cache_filename = build_cache_filename(experiment_name, model_name, temperature)
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_keys["deepseek"],
    }
    request_body = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "max_tokens": max_out_tokens,
        "response_format": {"type": "text"},
        "stream": False,
        "temperature": temperature,
    }
    result = send_request(
        cache_filename,
        "https://api.deepseek.com/chat/completions",
        request_headers,
        request_body,
        sensitive_headers=["Authorization", "Set-Cookie", "x-ds-trace-id", "CF-RAY"],
        sensitive_body_fields=["id"],
    )

    # Default to an empty list so that a response without choices yields None
    # instead of a TypeError.
    for choice in get_item(result, "response.body.choices", []):
        if get_item(choice, "message.role") == "assistant":
            return get_item(choice, "message.content")

    return None

#### Google Gemini Client

In [7]:
def query_gemini(
        experiment_name: str,
        system_prompt: str,
        user_prompt: str,
        temperature: float=TEMPERATURE,
        max_out_tokens: int=MAX_OUT_TOKENS,
        reasoning_tokens: int=MAX_REASONING_TOKENS,
        system_prompt_key: str="systemInstruction",
):
    """Query Gemini 2.5 Pro and return its (non-thought) text answer.

    Args:
        experiment_name: Used for building the cache file name.
        system_prompt: Text of the system instruction.
        user_prompt: Text of the user content.
        temperature: Sampling temperature.
        max_out_tokens: Value of the ``maxOutputTokens`` field.
        reasoning_tokens: Value of the ``thinkingBudget`` field.
        system_prompt_key: Name of the request body field which carries the
            system prompt.

    Returns:
        The text of the first part that is not flagged as a thought, taken
        from the first candidate whose content role is ``"model"``, or
        ``None`` if there is no such part.
    """
    # https://ai.google.dev/gemini-api/docs/text-generation
    # https://ai.google.dev/api/generate-content#method:-models.generatecontent

    model_name = "gemini-2.5-pro-exp-03-25"
    cache_filename = build_cache_filename(experiment_name, model_name, temperature)
    request_headers = {
        "Content-Type": "application/json",
        # The key is sent as a header instead of a "?key=..." query parameter:
        # HTTP errors raised by requests include the URL in their message,
        # and send_request prints that message, which would reveal the key.
        "x-goog-api-key": api_keys["google"],
    }
    request_body = {
        system_prompt_key: {
            "parts": [{"text": system_prompt}],
        },
        "contents": [
            {"parts": [{"text": user_prompt}]},
        ],
        "generationConfig": {
            "temperature": temperature,
            "maxOutputTokens": max_out_tokens,
            "responseModalities": ["text"],
            "thinkingConfig": {
                "includeThoughts": True,
                "thinkingBudget": reasoning_tokens,
            },
        },
    }
    url = "".join(
        (
            "https://generativelanguage.googleapis.com/v1beta/models/",
            urllib.parse.quote_plus(model_name),
            ":generateContent",
        )
    )
    result = send_request(
        cache_filename,
        url,
        request_headers,
        request_body,
        # Keep the API key out of the cache file.
        sensitive_headers=["x-goog-api-key"],
        sensitive_body_fields=[],
    )

    # The defaults and the "content.role" lookup make candidates that carry
    # no content get skipped instead of raising KeyError or TypeError.
    for candidate in get_item(result, "response.body.candidates", []):
        if get_item(candidate, "content.role") == "model":
            for part in get_item(candidate, "content.parts", []):
                text = get_item(part, "text")

                if text is not None and not get_item(part, "thought"):
                    return text

    return None

As of May 2025, the Gemini API documentation is inconsistent about the name
of the system prompt field: some pages use
[snake_case](https://ai.google.dev/gemini-api/docs/text-generation#system-instructions),
while others use
[camelCase](https://ai.google.dev/api/generate-content#method:-models.generatecontent).
The code below tries both spellings in order to see whether the API accepts
either or both of them.

In [8]:
# Send the same pirate-style request once per spelling of the system prompt
# field and print each answer under a heading that names the spelling.
pirate_experiments = (
    ("pirate-snake_case", "system_instruction"),
    ("pirate-camelCase", "systemInstruction"),
)

for position, (pirate_experiment, prompt_key) in enumerate(pirate_experiments):
    if position > 0:
        # Blank separator line between the two answers.
        print("")

    print(f"# {prompt_key}:")
    print(
        query_gemini(
            pirate_experiment,
            "Talk like a pirate.",
            "Explain in one brief sentence why the sky is blue.",
            system_prompt_key=prompt_key,
        )
    )

# system_instruction:
Aye, the wee bits o' air scatter the blue sunlight about more than the red, makin' the heavens look that fine azure color!

# systemInstruction:
Arrr, the wee bits o' air scatter the blue sunlight 'round more than the other colors, makin' the heavens look that fine shade!


#### OpenAI Client

In [9]:
def query_openai(
        model_name: str,
        accepts_temperature: bool,
        experiment_name: str,
        system_prompt: str,
        user_prompt: str,
        temperature: float=TEMPERATURE,
        max_out_tokens: int=MAX_OUT_TOKENS,
        reasoning_tokens: int=MAX_REASONING_TOKENS,
):
    """Query an OpenAI model through the Responses API and return its answer.

    Args:
        model_name: Name of the model to query.
        accepts_temperature: Whether to include ``temperature`` in the
            request body.
        experiment_name: Used for building the cache file name.
        system_prompt: Content of the ``developer`` input message.
        user_prompt: Content of the ``user`` input message.
        temperature: Sampling temperature; always part of the cache file
            name, but only sent when ``accepts_temperature`` is true.
        max_out_tokens: Value of the ``max_output_tokens`` field.
        reasoning_tokens: Not used by this client.

    Returns:
        The text of the first ``output_text`` content item found in an
        assistant message of the response, or ``None`` if there is none.
    """
    # https://platform.openai.com/docs/guides/text?api-mode=responses
    # https://platform.openai.com/docs/api-reference/responses/create

    cache_filename = build_cache_filename(experiment_name, model_name, temperature)
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_keys["openai"],
    }
    conversation = [
        {"role": "developer", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    payload = {
        "model": model_name,
        "max_output_tokens": max_out_tokens,
        "input": conversation,
        "stream": False,
    }

    if accepts_temperature:
        payload.update(temperature=temperature)

    result = send_request(
        cache_filename,
        "https://api.openai.com/v1/responses",
        headers,
        payload,
        sensitive_headers=["Authorization", "openai-organization", "x-request-id", "Set-Cookie", "CF-RAY"],
        sensitive_body_fields=["id", "output.*.id"],
    )

    for item in get_item(result, "response.body.output"):
        is_assistant_message = (
            get_item(item, "type") == "message"
            and get_item(item, "role") == "assistant"
        )

        if not is_assistant_message:
            continue

        for part in get_item(item, "content", []):
            if get_item(part, "type") == "output_text":
                return get_item(part, "text")

    return None


# Clients with the model name and the accepts_temperature flag bound in
# advance, so that they take the same arguments as the other query_*
# functions. The temperature is not sent for o3-mini.
query_gpt4 = functools.partial(query_openai, "gpt-4.1-2025-04-14", True)
query_o3mini = functools.partial(query_openai, "o3-mini-2025-01-31", False)

#### Perplexity Client

In [10]:
def query_perplexity(
        experiment_name: str,
        system_prompt: str,
        user_prompt: str,
        temperature: float=TEMPERATURE,
        max_out_tokens: int=MAX_OUT_TOKENS,
        reasoning_tokens: int=MAX_REASONING_TOKENS,
):
    """Query Perplexity's ``sonar-reasoning-pro`` model and return its answer.

    Args:
        experiment_name: Used for building the cache file name.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        temperature: Sampling temperature.
        max_out_tokens: Value of the ``max_tokens`` field.
        reasoning_tokens: Not used by this client.

    Returns:
        The message content of the first choice whose role is
        ``"assistant"``, or ``None`` if there is no such choice.
    """
    # https://docs.perplexity.ai/guides/getting-started
    # https://docs.perplexity.ai/api-reference/chat-completions

    model_name = "sonar-reasoning-pro"
    cache_filename = build_cache_filename(experiment_name, model_name, temperature)
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": "Bearer " + api_keys["perplexity"],
    }
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    payload = {
        "model": model_name,
        "messages": conversation,
        "max_tokens": max_out_tokens,
        "temperature": temperature,
        "return_related_questions": False,
        "stream": False,
        "web_search_options": {
            "search_context_size": "low",
        },
    }
    result = send_request(
        cache_filename,
        "https://api.perplexity.ai/chat/completions",
        headers,
        payload,
        sensitive_headers=["Authorization", "Set-Cookie", "CF-RAY"],
        sensitive_body_fields=["id"],
    )

    for choice in get_item(result, "response.body.choices"):
        if get_item(choice, "message.role") != "assistant":
            continue

        return get_item(choice, "message.content")

    return None