In [None]:
import json
import math
import string

import openai
from langchain.schema import HumanMessage
from langchain_openai import ChatOpenAI

In [None]:
# Build the shared chat model that every clue-solving call goes through.
# The settings are collected in one mapping so they can be read (and tuned)
# at a glance before being handed to the constructor.
_LLM_OPTIONS = {
    "model": "gpt-4o",
    "temperature": 0,
    # Replies are capped at three tokens; the prompt asks for a single word.
    "max_tokens": 3,
    # Request per-token log-probabilities, including the five highest-ranked
    # candidates at each position, for the confidence analysis further down.
    "logprobs": True,
    "top_logprobs": 5,
}
llm = ChatOpenAI(**_LLM_OPTIONS)

# Sample crossword clues (you can replace these with actual Boatload Puzzles clues)
# Each pair is (clue text, expected answer). The letter count given to the
# model is taken from the answer itself, so the two cannot drift apart.
_CLUE_ANSWER_PAIRS = (
    ("Capital of France", "PARIS"),
    ("Large body of water", "OCEAN"),
    ("Man's best friend", "DOG"),
    ("Yellow fruit", "BANANA"),
    ("Opposite of hot", "COLD"),
    ("Flying mammal", "BAT"),
    ("Red planet", "MARS"),
    ("Feline pet", "CAT"),
    ("Frozen water", "ICE"),
    ("King of the jungle", "LION"),
)

crossword_clues = [
    {"clue": clue_text, "letters": len(solution), "answer": solution}
    for clue_text, solution in _CLUE_ANSWER_PAIRS
]

In [4]:
def solve_crossword_clue(clue, letters):
    """Ask the shared ``llm`` for a one-word answer to a crossword clue.

    Args:
        clue: Clue text, interpolated into the prompt.
        letters: Length of the expected answer, interpolated into the prompt.

    Returns:
        A dict with three keys, or ``None`` if the request or the parsing of
        the reply raised (the error is printed, not re-raised):

        * ``'answer'``: the reply text, upper-cased, with surrounding
          whitespace and punctuation removed.
        * ``'logprobs'``: ``response_metadata['logprobs']`` when present,
          otherwise ``None``.
        * ``'full_response'``: the untouched response object.
    """
    prompt = f"Solve this crossword and answer in one word. The clue is '{clue}' and it is a {letters} letter word. The answer is:"

    try:
        response = llm.invoke([HumanMessage(content=prompt)])

        # A reply such as `Paris.` or `"Paris"` must still match `PARIS`, so
        # trim punctuation as well as whitespace from both ends. Characters
        # inside the word are left alone.
        answer = response.content.strip(string.whitespace + string.punctuation).upper()

        # Log-probabilities are optional; a missing or None metadata mapping
        # simply yields None instead of an error.
        metadata = getattr(response, 'response_metadata', None) or {}
        logprobs_data = metadata.get('logprobs')

        return {
            'answer': answer,
            'logprobs': logprobs_data,
            'full_response': response
        }
    except Exception as e:
        # Deliberately best-effort: one failed clue should not abort the run.
        print(f"Error solving clue '{clue}': {e}")
        return None

In [5]:
def analyze_logprobs(logprobs_data):
    """Turn the model's log-probability payload into per-token confidences.

    Args:
        logprobs_data: Mapping with a ``'content'`` list; each entry may carry
            a ``'top_logprobs'`` list of ``{'token', 'logprob'}`` dicts. May
            also be ``None`` or empty.

    Returns:
        The string ``"No logprobs available"`` when ``logprobs_data`` is
        falsy, has no ``'content'`` key, or its ``'content'`` is ``None``.
        Otherwise a list with one dict per usable token position:
        ``{'token': str, 'logprob': float, 'confidence': 'NN.NN%'}``, taken
        from the first entry of that position's ``'top_logprobs'``. Positions
        without candidates are skipped.
    """
    if not logprobs_data or 'content' not in logprobs_data:
        return "No logprobs available"

    content = logprobs_data['content']
    if content is None:
        # Nothing to iterate over; report it the same way as a missing payload.
        return "No logprobs available"

    analysis = []
    for token_data in content:
        if not token_data or 'top_logprobs' not in token_data:
            continue
        candidates = token_data['top_logprobs']
        top_prob = candidates[0] if candidates else None
        if not top_prob:
            continue
        # exp(logprob) is the probability; math.exp avoids the rounding drift
        # of a hand-typed constant for e.
        confidence = round(100 * math.exp(top_prob['logprob']), 2)
        analysis.append({
            'token': top_prob['token'],
            'logprob': top_prob['logprob'],
            'confidence': f"{confidence}%"
        })

    return analysis

In [None]:
def _average_confidence(logprobs_analysis):
    """Return the mean of the 'NN.NN%' confidences in an analysis list, or None if there are none."""
    if not logprobs_analysis or logprobs_analysis == "No logprobs available":
        return None
    confidences = [
        float(token_info['confidence'].replace('%', ''))
        for token_info in logprobs_analysis
    ]
    if not confidences:
        return None
    return sum(confidences) / len(confidences)


def main():
    """Run every clue in ``crossword_clues`` through the model and print a report.

    For each clue this prints the expected answer, the model's answer with a
    correct/wrong marker, and the per-token confidences. It then prints a
    summary (total, correct, accuracy) and a list of "dangerous" cases:
    correct answers whose average token confidence is below 50%, and wrong
    answers whose average token confidence is above 80%.

    Reads the module-level ``crossword_clues`` and calls
    ``solve_crossword_clue`` and ``analyze_logprobs``. Returns ``None``; all
    output goes to stdout. An empty clue list is reported with 0.0% accuracy
    rather than failing on a division by zero.
    """
    correct_answers = 0
    total_clues = len(crossword_clues)
    results = []

    print("Testing GPT-4o on Crossword Clues")
    print("=" * 50)

    for i, clue_data in enumerate(crossword_clues, 1):
        clue = clue_data['clue']
        letters = clue_data['letters']
        expected = clue_data['answer']

        print(f"\n{i}. Clue: '{clue}' ({letters} letters)")
        print(f"Expected: {expected}")

        result = solve_crossword_clue(clue, letters)

        if not result:
            # The solver already printed the error; record the miss and go on.
            print("Failed to get response")
            results.append({
                'clue': clue,
                'expected': expected,
                'actual': None,
                'correct': False,
                'logprobs_analysis': None
            })
            continue

        actual = result['answer']
        is_correct = actual == expected

        if is_correct:
            correct_answers += 1
            status = "✓ CORRECT"
        else:
            status = "✗ WRONG"

        print(f"GPT-4o Answer: {actual} {status}")

        # Analyze logprobs
        logprobs_analysis = analyze_logprobs(result['logprobs'])
        if logprobs_analysis != "No logprobs available":
            print("Confidence Analysis:")
            for token_info in logprobs_analysis:
                print(f"  Token: '{token_info['token']}' - Confidence: {token_info['confidence']} (logprob: {token_info['logprob']:.3f})")

        results.append({
            'clue': clue,
            'expected': expected,
            'actual': actual,
            'correct': is_correct,
            'logprobs_analysis': logprobs_analysis
        })

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total clues: {total_clues}")
    print(f"Correct answers: {correct_answers}")
    # With no clues there is nothing to divide by; report 0.0% instead.
    accuracy = (correct_answers / total_clues) * 100 if total_clues else 0.0
    print(f"Accuracy: {accuracy:.1f}%")

    # Analyze dangerous cases (low confidence but correct, or high confidence but wrong)
    print("\nDANGEROUS CASES ANALYSIS:")
    dangerous_cases = []

    for result in results:
        avg_confidence = _average_confidence(result['logprobs_analysis'])
        if avg_confidence is None:
            continue

        # Dangerous case 1: Low confidence but correct
        if result['correct'] and avg_confidence < 50:
            dangerous_cases.append(f"Low confidence correct: '{result['clue']}' - {avg_confidence:.1f}% confidence")

        # Dangerous case 2: High confidence but wrong
        elif not result['correct'] and avg_confidence > 80:
            dangerous_cases.append(f"High confidence wrong: '{result['clue']}' - {avg_confidence:.1f}% confidence")

    if dangerous_cases:
        for case in dangerous_cases:
            print(f"- {case}")
    else:
        print("No particularly dangerous cases detected.")

if __name__ == "__main__":
    # Make sure your OpenAI API key is set before main() runs.
    # The commented-out line below shows one way to set it from code; enabling
    # it also requires `import os`, which is not among the imports visible in
    # this file. Replace the placeholder locally and never commit a real key.
    # os.environ["OPENAI_API_KEY"] = "your-api-key-here"
    main()