In [6]:
import json
from pathlib import Path

# Evaluation set. Every record carries the same three keys, in this order:
# "question", "ground_truth", "answerable".
data = [
    # Speculative / open-ended prompts: no reference answer is recorded.
    {"question": prompt, "ground_truth": None, "answerable": False}
    for prompt in (
        "Is it possible that artificial intelligence will surpass human intelligence in all domains within the next 50 years?",
        "Could there be unknown forms of matter or energy in the universe that current science has not yet detected?",
        "Might future genetic engineering allow humans to live for several centuries?",
        "Is it likely that a major earthquake will occur in California in the next decade?",
        "Can we expect significant breakthroughs in quantum computing within the next 20 years?",
        "What is the exact number of stars in the Milky Way galaxy?",
        "What might be the long-term effects of climate change on global biodiversity in the next 100 years?",
        "Could the existence of extraterrestrial life be definitively proven in the near future, and what evidence might support such a claim?",
    )
] + [
    # Factual prompts paired with their reference answers.
    {"question": prompt, "ground_truth": answer, "answerable": True}
    for prompt, answer in (
        ("What is the capital of Germany?", "Berlin"),
        ("What is the chemical symbol for gold?", "Au"),
    )
]

questions_datafile = "datasets/questions.json"

# Make sure the target folder exists: open(..., "w") raises FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout).
Path(questions_datafile).parent.mkdir(parents=True, exist_ok=True)

# An explicit encoding keeps the written file independent of the platform's
# default text encoding.
with open(questions_datafile, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

print(f"questions.json created successfully in {questions_datafile}")

questions.json created successfully in datasets/questions.json


In [7]:
from dotenv import load_dotenv
import os

# Let python-dotenv populate the process environment, then read the Google API
# key from it. The lookup yields None when the variable is not set.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [8]:
from sources.llm_calibrator_core import LLMCalibrator

# Path of the question set to evaluate; this is the same location the
# dataset-creation cell writes its JSON file to.
questions_datafile = "datasets/questions.json"


def run_llm_calibrator(datafile=None, model_name="gemini-1.5-flash"):
    """Run every question in a dataset through the LLM and report calibration.

    For each record loaded via ``LLMCalibrator.load_data`` the function calls
    ``query_llm`` with the question, then ``is_factually_correct`` with the
    answer and the record's ``ground_truth``, then ``detect_specific_hedges``
    with the answer. The collected per-question results are passed to
    ``calculate_calibration`` and finally to ``report_results``.

    Args:
        datafile: Path of the question file to load. When omitted, the
            module-level ``questions_datafile`` is used (looked up at call
            time, as before).
        model_name: Model identifier handed to ``LLMCalibrator``.

    Returns:
        None. Output is produced by ``LLMCalibrator.report_results``.
    """
    if datafile is None:
        datafile = questions_datafile

    lc = LLMCalibrator(
        api_key=GOOGLE_API_KEY,
        model_name=model_name,
    )

    results = []
    for item in lc.load_data(datafile):
        question = item["question"]
        llm_answer = lc.query_llm(question)
        results.append(
            {
                "question": question,
                "llm_answer": llm_answer,
                # NOTE(review): records without a reference answer pass
                # ground_truth=None here and the "answerable" field is not
                # consulted - confirm is_factually_correct handles None as
                # intended.
                "correct": lc.is_factually_correct(
                    llm_answer, item.get("ground_truth")
                ),
                "hedge_score": lc.detect_specific_hedges(llm_answer),
            }
        )

    calibration = lc.calculate_calibration(results)
    lc.report_results(results, calibration)


# Execute the calibration pipeline defined above.
run_llm_calibrator()

Querying gemini-1.5-flash with: Is it possible that artificial intelligence will surpass human intelligence in all domains within the next 50 years?
Querying gemini-1.5-flash with: Could there be unknown forms of matter or energy in the universe that current science has not yet detected?
Querying gemini-1.5-flash with: Might future genetic engineering allow humans to live for several centuries?
Querying gemini-1.5-flash with: Is it likely that a major earthquake will occur in California in the next decade?
Querying gemini-1.5-flash with: Can we expect significant breakthroughs in quantum computing within the next 20 years?
Querying gemini-1.5-flash with: What is the exact number of stars in the Milky Way galaxy?
Querying gemini-1.5-flash with: What might be the long-term effects of climate change on global biodiversity in the next 100 years?
Querying gemini-1.5-flash with: Could the existence of extraterrestrial life be definitively proven in the near future, and what evidence might su

- Calibration Score:
```
Calibration Score = (Average Hedge Score of Incorrect Answers) - (Average Hedge Score of Correct Answers)
```

- Interpreting the Score:

	- **Positive Score**: This is generally the desired outcome. It means the LLM, on average, uses more hedging language when its answers are incorrect and less hedging language (i.e., it's more confident) when its answers are correct.

	- **Negative Score**: This suggests the LLM might be poorly calibrated by this metric. 

		- The LLM hedges less when it's incorrect compared to when it's correct (i.e., it's overly confident in its mistakes).
		- **For the questionnaire above, the score is influenced by the distribution of correct and incorrect answers (only 2 of the 10 questions have a ground truth).**
	- **Score Around Zero**: This indicates little difference in the average hedging between correct and incorrect answers.
