In [1]:
import requests
import json
import statistics
import re

In [2]:
class ConfidenceAssessment:
    def __init__(self, model_name="llama3"):
        self.model_name = model_name
        self.ollama_url = "http://localhost:11434/api/generate"
    
    def query_llama(self, prompt, temperature=0.7):
        """Send query to Ollama"""
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": temperature}
        }
        response = requests.post(self.ollama_url, json=payload)
        return response.json()["response"].strip()
    
    def method1_direct_confidence(self, question):
        prompt = f"""Question: {question}
        
                Please answer the question and then rate your confidence from 0-100 (where 100 is completely certain).
                Format: Answer: [your answer]
                Confidence: [0-100]"""
        
        response = self.query_llama(prompt)
        
        # Extract confidence score
        confidence_match = re.search(r'Confidence:\s*(\d+)', response, re.IGNORECASE)
        confidence = int(confidence_match.group(1)) if confidence_match else 50
        
        return response, confidence
    
    def method2_margin_sampling(self, question, n_samples=5):
        answers = []
        temperatures = [0.1, 0.3, 0.5, 0.7, 0.9]
        
        for temp in temperatures[:n_samples]:
            prompt = f"Question: {question}\nAnswer briefly:"
            answer = self.query_llama(prompt, temperature=temp)
            answers.append(answer.lower().strip())
        
        # Calculate consistency (inverse of diversity)
        unique_answers = len(set(answers))
        consistency = ((n_samples - unique_answers + 1) / n_samples) * 100
        
        return answers[0], consistency
    
    def method3_self_consistency(self, question, n_samples=5):
        prompt = f"Question: {question}\nAnswer briefly:"
        answers = []
        
        for _ in range(n_samples):
            answer = self.query_llama(prompt, temperature=0.8)
            answers.append(answer.lower().strip())
        
        # Find most common answer and calculate confidence
        from collections import Counter
        answer_counts = Counter(answers)
        most_common_answer, count = answer_counts.most_common(1)[0]
        confidence = (count / n_samples) * 100
        
        return most_common_answer, confidence


In [6]:
def test_confidence_methods():
    
    # Test questions - mix of easy facts and potentially tricky ones
    test_questions = [
        "What is the capital of France?",
        "In what year did World War II end?",
        "What is the largest planet in our solar system?",
        "Who painted the Mona Lisa?",
        "What is the chemical symbol for gold?",
        "In what year was the Berlin Wall torn down?",
        "What is the deepest ocean trench on Earth?",
        "Who was the first person to walk on the moon?"
    ]
    
    assessor = ConfidenceAssessment()
    results = []
    
    print("Testing Confidence Assessment Methods")
    print("=" * 50)
    
    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}. {question}")
        print("-" * 40)
        
        try:
            # Method 1: Direct confidence
            answer1, conf1 = assessor.method1_direct_confidence(question)
            print(answer1)
            print(f"Direct Confidence: {conf1}%")
            
            # Method 2: Margin sampling
            answer2, conf2 = assessor.method2_margin_sampling(question)
            print(answer2)
            print(f"Margin Sampling: {conf2:.1f}%")
            
            # Method 3: Self-consistency
            answer3, conf3 = assessor.method3_self_consistency(question)
            print(answer3)
            print(f"Self-Consistency: {conf3:.1f}%")
            
            results.append({
                'question': question,
                'direct_conf': conf1,
                'margin_conf': conf2,
                'self_cons_conf': conf3,
                'answers': [answer1, answer2, answer3]
            })
            
        except Exception as e:
            print(f"Error processing question: {e}")
    
    # Summary analysis
    print("\n" + "=" * 50)
    print("SUMMARY ANALYSIS")
    print("=" * 50)
    
    if results:
        direct_scores = [r['direct_conf'] for r in results]
        margin_scores = [r['margin_conf'] for r in results]
        self_cons_scores = [r['self_cons_conf'] for r in results]
        
        print(f"Average Confidence Scores:")
        print(f"Direct Confidence: {statistics.mean(direct_scores):.1f}%")
        print(f"Margin Sampling: {statistics.mean(margin_scores):.1f}%")
        print(f"Self-Consistency: {statistics.mean(self_cons_scores):.1f}%")
        
        print(f"\nStandard Deviation:")
        print(f"Direct Confidence: {statistics.stdev(direct_scores):.1f}")
        print(f"Margin Sampling: {statistics.stdev(margin_scores):.1f}")
        print(f"Self-Consistency: {statistics.stdev(self_cons_scores):.1f}")
        
        print("\nObservations:")
        print("- Lower std dev suggests more calibrated confidence")
        print("- Compare how methods handle easy vs hard questions")
        print("- Self-consistency often most reliable for factual questions")

In [7]:
if __name__ == "__main__":
    test_confidence_methods()

Testing Confidence Assessment Methods

1. What is the capital of France?
----------------------------------------
Answer: Paris

Confidence: 100
Direct Confidence: 100%
paris.
Margin Sampling: 100.0%
paris.
Self-Consistency: 80.0%

2. In what year did World War II end?
----------------------------------------
Answer: 1945

Confidence: 100
Direct Confidence: 100%
world war ii ended in 1945.
Margin Sampling: 60.0%
world war ii ended on september 2, 1945.
Self-Consistency: 40.0%

3. What is the largest planet in our solar system?
----------------------------------------
Answer: Jupiter

Confidence: 100
Direct Confidence: 100%
jupiter.
Margin Sampling: 80.0%
jupiter.
Self-Consistency: 40.0%

4. Who painted the Mona Lisa?
----------------------------------------
Answer: Leonardo da Vinci

Confidence: 99
Direct Confidence: 99%
leonardo da vinci.
Margin Sampling: 100.0%
leonardo da vinci.
Self-Consistency: 100.0%

5. What is the chemical symbol for gold?
--------------------------------------