In [2]:
import os
import random
import re
from pathlib import Path

import httpx
import pandas as pd
from anthropic import Anthropic

# Initialize the Anthropic client.
#
# The API key is read from the ANTHROPIC_API_KEY environment variable so that no
# credential is ever stored in the notebook. Any key that was previously pasted
# into this file must be treated as compromised and revoked.
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it before running this notebook."
    )

# TLS certificate verification stays ON by default. Disabling it exposes the API
# key and all prompts to man-in-the-middle interception, so it is only turned off
# when the (notebook-specific) NOTEBOOK_INSECURE_SKIP_TLS_VERIFY variable is set
# explicitly, e.g. while debugging behind an intercepting proxy.
verify_tls = os.environ.get("NOTEBOOK_INSECURE_SKIP_TLS_VERIFY", "").strip().lower() not in {
    "1",
    "true",
    "yes",
}
if not verify_tls:
    print("⚠️  TLS certificate verification is DISABLED for Anthropic API calls")

http_client = httpx.Client(verify=verify_tls)
client = Anthropic(
    api_key=api_key,
    http_client=http_client,
)

# Read the base question set from parquet (fastparquet backend) and pull the
# 'question' column out as a plain Python list.
base_questions_file = Path('../data/baseQuestions.parquet')
df_base = pd.read_parquet(path=base_questions_file, engine='fastparquet')
base_questions = df_base.loc[:, 'question'].to_list()

print(f"Loaded {len(base_questions)} base questions")

# Generate hallucination-inducing questions
hallucination_questions = []
target_count = 50

# Fixed seed so the same base questions are sampled on every run; change it to
# draw a different subset. (The model's output itself is still non-deterministic.)
SAMPLE_SEED = 42

# Leading list markers ("1. ", "2) ", "- ", "* ", "• ") that the model may add
# despite the "no numbering" instruction. Whitespace is required after the marker
# so that questions legitimately starting with a number ("2023 ...", "3.5% ...")
# are left untouched.
_LIST_MARKER = re.compile(r"^\s*(?:\d+[.)]|[-*•])\s+")


def _parse_generated_questions(raw_text, limit):
    """Split a model reply into at most `limit` cleaned question strings.

    Each non-blank line is treated as one question. A leading list marker is
    stripped from the line rather than causing the whole line to be dropped.
    """
    questions = []
    for line in raw_text.splitlines():
        cleaned = _LIST_MARKER.sub("", line).strip()
        if cleaned:
            questions.append(cleaned)
    return questions[:limit]


# Sample base questions to work with
sample_size = min(10, len(base_questions))
sampled_questions = random.Random(SAMPLE_SEED).sample(base_questions, sample_size)

for i, base_question in enumerate(sampled_questions):
    # Spread target_count across the sampled base questions; the first
    # (target_count % sample_size) questions each absorb one extra.
    questions_per_base = (target_count // sample_size) + (1 if i < (target_count % sample_size) else 0)

    prompt = f"""Based on this question: "{base_question}"

Generate {questions_per_base} new questions designed to cause LLM hallucinations (not just incorrect answers, but actual hallucinations involving confabulation of plausible-sounding but false details).

Use these techniques:
- Replace entities with plausible but potentially non-existent names
- Add specific numerical details (dates, percentages, amounts, limits)
- Combine real concepts with fabricated specifics
- Include precise policy/feature details that require exact knowledge
- Use obscure or ambiguous entity names that sound legitimate

Return ONLY the questions, one per line, no numbering or extra text."""

    response = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=2000,
        messages=[{"role": "user", "content": prompt}]
    )

    # Concatenate every text block instead of assuming content[0] exists and is
    # text; an empty or non-text reply then simply yields zero questions.
    raw_text = "".join(
        block.text for block in response.content if getattr(block, "type", None) == "text"
    )
    generated = _parse_generated_questions(raw_text, questions_per_base)

    hallucination_questions.extend(generated)
    print(f"Generated {len(generated)} questions from base question {i+1}/{len(sampled_questions)}")

# Cap at the target. Truncation cannot make up for a shortfall, so report one
# explicitly instead of silently saving fewer questions than requested.
hallucination_questions = hallucination_questions[:target_count]
if len(hallucination_questions) < target_count:
    print(f"⚠️  Only {len(hallucination_questions)} of {target_count} requested questions were generated")

# Persist the generated questions as a single-column parquet file
output_file = Path('../data/hallucinationQuestions.parquet')
df_hallucination = pd.DataFrame(hallucination_questions, columns=['question'])
df_hallucination.to_parquet(path=output_file, engine='fastparquet', index=False)

print(f"\nTotal hallucination questions generated: {len(hallucination_questions)}")
print(f"Saved to: {output_file}")

Loaded 255 base questions
Generated 5 questions from base question 1/10
Generated 5 questions from base question 2/10
Generated 5 questions from base question 3/10
Generated 5 questions from base question 4/10
Generated 5 questions from base question 5/10
Generated 5 questions from base question 6/10
Generated 5 questions from base question 7/10
Generated 5 questions from base question 8/10
Generated 5 questions from base question 9/10
Generated 5 questions from base question 10/10

Total hallucination questions generated: 50
Saved to: ../data/hallucinationQuestions.parquet


In [None]:
# Test API connection
import os

import httpx

# Read the key from the environment rather than embedding it in the notebook;
# a key pasted into source is leaked to everyone who can read the file.
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    print("❌ ANTHROPIC_API_KEY not found in environment")
elif api_key.startswith("sk-ant-"):
    # Echo only the fixed "sk-ant-" prefix so no secret characters end up in
    # the saved cell output.
    print(f"✓ API key found: {api_key[:7]}...")
else:
    print("⚠️  API key found but doesn't match expected format")

# Test network connectivity
try:
    # Any HTTP status is reported as reachable: the check only needs the
    # request to complete, not to succeed.
    response = httpx.get("https://api.anthropic.com", timeout=10.0)
    print(f"✓ Can reach Anthropic API (status: {response.status_code})")
except Exception as e:
    # Deliberately broad: this is a diagnostic cell and should report whatever
    # went wrong (DNS, TLS, timeout, proxy) instead of raising.
    print(f"❌ Cannot reach Anthropic API: {e}")