In [None]:
# LLM Health Advice Evaluation Analysis

**Research Question:** How factually accurate and unbiased are GPT-4's responses to common mental health and wellness questions compared to information from the CDC and WHO?

## Project Overview
- **13 health questions** spanning pregnancy, mental health, general wellness, and medical procedures
- **GPT-4 responses** generated for each question  
- **Ground truth** from verified sources (CDC, WHO, Mayo Clinic, medical communities)
- **Automated evaluations** using GPT-4 as the evaluator (not human experts), scoring each response from 1 to 5 on 4 criteria: Factual Accuracy, Clarity, Neutrality, Helpfulness

---


In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from collections import Counter
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and runtime
# warnings raised by pandas/numpy — consider narrowing it.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's default style sheet, seaborn's "husl" palette,
# and a notebook-wide figure size and font size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

print("üìä Libraries loaded successfully!")


In [None]:
# Load data
# Both inputs are JSON files under ../data. The encoding is passed explicitly
# so the files are decoded as UTF-8 on every platform instead of with the
# locale-dependent default that open() would otherwise use.
data_path = Path("../data")

with open(data_path / "questions.json", encoding="utf-8") as f:
    questions_data = json.load(f)

with open(data_path / "evaluations.json", encoding="utf-8") as f:
    evaluations_data = json.load(f)

print(f"‚úÖ Loaded {len(questions_data)} questions")
print(f"‚úÖ Loaded {len(evaluations_data)} evaluations")

# Display first question as example. Skipped when the file holds no questions,
# rather than failing with an IndexError on questions_data[0].
if questions_data:
    example_question = questions_data[0]
    print("\nüìã Example Question:")
    print(f"ID: {example_question['id']}")
    print(f"Question: {example_question['question']}")
    print(f"GPT Response: {example_question['response'][:100]}...")
    print(f"Ground Truth: {example_question['answer'][:100]}...")


In [None]:
# Whitespace and markdown emphasis characters (*, _, `) that an LLM evaluator
# may place between a criterion label, the colon, and the score,
# e.g. "**Clarity**: 4", "**Clarity:** 4" or "Clarity: **4**".
_SCORE_DECORATION = r'[\s*_`]*'

# Criterion key -> compiled, case-insensitive pattern; group 1 holds the digits.
_SCORE_PATTERNS = {
    criterion: re.compile(
        label + _SCORE_DECORATION + ':' + _SCORE_DECORATION + r'(\d+)',
        re.IGNORECASE,
    )
    for criterion, label in (
        ('factual_accuracy', r'(?:Factual\s+)?Accuracy'),
        ('clarity', r'Clarity'),
        ('neutrality', r'Neutrality'),
        ('helpfulness', r'Helpfulness'),
    )
}


def parse_evaluation_scores(evaluation_text):
    """Extract numerical scores from evaluation text

    Looks for patterns like "Factual Accuracy: 4" or "Accuracy: 4" for each of
    the four criteria. Matching is case-insensitive and tolerates markdown
    emphasis or extra whitespace around the colon and the number.

    Args:
        evaluation_text: Free-text evaluation. Anything that is not a string
            (for example None) is treated as containing no scores.

    Returns:
        dict with the keys 'factual_accuracy', 'clarity', 'neutrality' and
        'helpfulness'. Each value is the int taken from the first match for
        that criterion, or None when no match was found.
    """
    if not isinstance(evaluation_text, str):
        return {criterion: None for criterion in _SCORE_PATTERNS}

    scores = {}
    for criterion, pattern in _SCORE_PATTERNS.items():
        match = pattern.search(evaluation_text)
        scores[criterion] = int(match.group(1)) if match else None

    return scores

# Smoke-test the parser on the first stored evaluation: show the head of the
# raw text next to the scores extracted from it.
test_eval = evaluations_data[0]['evaluation']
print("üîç Testing score extraction:")
preview = test_eval[:200]
print(f"Evaluation text: {preview}...")
print(f"Extracted scores: {parse_evaluation_scores(test_eval)}")


In [None]:
# Create comprehensive analysis dataframe
SCORE_COLUMNS = ['factual_accuracy', 'clarity', 'neutrality', 'helpfulness']

# Ordered keyword rules: the first category with any keyword contained in the
# lower-cased question wins; a question matching no rule is 'General Health'.
# NOTE(review): this is plain substring matching, so 'iud' also matches "fluid"
# and 'birth' matches "birthday" — check the category counts printed below.
CATEGORY_KEYWORDS = [
    ('Pregnancy/Reproductive Health', ['pregnant', 'pregnancy', 'birth', 'c section', 'uterus']),
    ('Mental Health', ['psychologist', 'meds', 'medication', 'antidepressant']),
    ('Medical Procedures', ['surgery', 'mri', 'iud', 'doctor', 'medical', 'procedure']),
]


def categorize_question(question_text):
    """Return the category label for a question using CATEGORY_KEYWORDS."""
    lowered = question_text.lower()
    for label, keywords in CATEGORY_KEYWORDS:
        if any(word in lowered for word in keywords):
            return label
    return 'General Health'


analysis_data = []

for eval_item in evaluations_data:
    # Parse evaluation scores
    scores = parse_evaluation_scores(eval_item['evaluation'])
    question_text = eval_item['question']

    analysis_data.append({
        'id': eval_item['id'],
        # Questions longer than 100 characters are cut and marked with '...'
        'question': (question_text[:100] + '...') if len(question_text) > 100 else question_text,
        'category': categorize_question(question_text),
        **{column: scores[column] for column in SCORE_COLUMNS},
        # Response lengths are whitespace-separated word counts
        'gpt_response_length': len(eval_item['gpt_response'].split()),
        'ground_truth_length': len(eval_item['ground_truth'].split()),
        'evaluation_text': eval_item['evaluation'],
    })

df = pd.DataFrame(analysis_data)

# A criterion the parser never found arrives as a column of None with dtype
# object, and comparisons such as `df['factual_accuracy'] <= 2` raise TypeError
# on it. Coercing makes missing scores NaN in a numeric column instead.
df[SCORE_COLUMNS] = df[SCORE_COLUMNS].apply(pd.to_numeric, errors='coerce')

# Surface rows with at least one unparsed score instead of silently averaging
# over whatever criteria happened to match.
unparsed = df[df[SCORE_COLUMNS].isna().any(axis=1)]
if not unparsed.empty:
    print(f"WARNING: {len(unparsed)} evaluation(s) have unparsed scores (ids: {unparsed['id'].tolist()})")

# Calculate overall score (row-wise mean; NaN criteria are skipped)
df['overall_score'] = df[SCORE_COLUMNS].mean(axis=1)

print(f"üìä Created analysis dataframe with {len(df)} entries")
print(f"Categories: {df['category'].value_counts().to_dict()}")
print(f"\nüéØ Overall Statistics:")
print(df[SCORE_COLUMNS + ['overall_score']].describe())


In [None]:
## 📊 Key Findings Visualizations


In [None]:
# 1. Overall Performance Across Criteria
# 2x2 dashboard: mean score per criterion, histogram of overall scores,
# mean overall score per category, and the five best-scoring questions.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Criteria scores distribution
criteria = ['factual_accuracy', 'clarity', 'neutrality', 'helpfulness']
mean_scores = [df[criterion].mean() for criterion in criteria]
criteria_labels = ['Factual\nAccuracy', 'Clarity', 'Neutrality', 'Helpfulness']

ax1.bar(criteria_labels, mean_scores, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax1.set_ylim(0, 5)
ax1.set_title('Average Scores by Criteria', fontsize=14, fontweight='bold')
ax1.set_ylabel('Score (1-5)')
for i, v in enumerate(mean_scores):
    ax1.text(i, v + 0.1, f'{v:.2f}', ha='center', fontweight='bold')

# Score distribution histogram
# NaN overall scores (questions with no parsed score) are dropped first:
# the histogram cannot derive a bin range from data containing NaN.
scored_overall = df['overall_score'].dropna()
overall_mean = df['overall_score'].mean()
ax2.hist(scored_overall, bins=10, alpha=0.7, color='#FF6B6B', edgecolor='black')
ax2.set_title('Distribution of Overall Scores', fontsize=14, fontweight='bold')
ax2.set_xlabel('Overall Score (1-5)')
ax2.set_ylabel('Frequency')
ax2.axvline(overall_mean, color='red', linestyle='--', linewidth=2,
            label=f'Mean: {overall_mean:.2f}')
ax2.legend()

# Performance by category (ascending, so the best category is the last bar)
category_scores = df.groupby('category')['overall_score'].mean().sort_values(ascending=True)
ax3.barh(category_scores.index, category_scores.values, color='#45B7D1')
ax3.set_title('Average Score by Question Category', fontsize=14, fontweight='bold')
ax3.set_xlabel('Average Overall Score')
for i, v in enumerate(category_scores.values):
    ax3.text(v + 0.05, i, f'{v:.2f}', va='center', fontweight='bold')

# Individual question performance
# Unscored rows are removed before ranking: sort_values puts NaN last, so
# without this they would be reported as the "top" questions.
question_performance = (
    df[['question', 'overall_score']]
    .dropna(subset=['overall_score'])
    .sort_values('overall_score')
)
top_5 = question_performance.tail(5)
bottom_5 = question_performance.head(5)  # not plotted in this figure

y_pos = range(len(top_5))
ax4.barh(y_pos, top_5['overall_score'], color='green', alpha=0.7, label='Top 5')
ax4.set_yticks(y_pos)
# Only add an ellipsis when the label was actually shortened
ax4.set_yticklabels([q if len(q) <= 30 else q[:30] + '...' for q in top_5['question']], fontsize=9)
ax4.set_title('Top 5 Performing Questions', fontsize=14, fontweight='bold')
ax4.set_xlabel('Overall Score')

plt.tight_layout()
# Create the output directory on a fresh checkout; savefig does not create it.
figure_path = Path('../results/figures/overall_performance.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"üéØ KEY INSIGHTS:")
print(f"‚Ä¢ Average Overall Score: {overall_mean:.2f}/5.0")
print(f"‚Ä¢ Highest Criterion: {criteria_labels[np.argmax(mean_scores)]} ({max(mean_scores):.2f})")
print(f"‚Ä¢ Lowest Criterion: {criteria_labels[np.argmin(mean_scores)]} ({min(mean_scores):.2f})")
print(f"‚Ä¢ Best Category: {category_scores.index[-1]} ({category_scores.iloc[-1]:.2f})")
print(f"‚Ä¢ Worst Category: {category_scores.index[0]} ({category_scores.iloc[0]:.2f})")


In [None]:
# 2. Detailed Analysis: Potential Issues and Bias Detection
print("üîç DETAILED ANALYSIS:")
print("="*60)

# Work on a view whose score columns are guaranteed numeric. A criterion that
# was never parsed is an object column of None, and `<=` on it raises TypeError.
scored_df = df.assign(**{
    column: pd.to_numeric(df[column], errors='coerce')
    for column in ('factual_accuracy', 'neutrality', 'overall_score')
})

# Find questions with concerning scores (factual accuracy of 2 or lower)
low_accuracy = scored_df[scored_df['factual_accuracy'] <= 2]
if not low_accuracy.empty:
    print(f"\n‚ùå LOW FACTUAL ACCURACY QUESTIONS ({len(low_accuracy)}):")
    for _, row in low_accuracy.iterrows():
        print(f"‚Ä¢ Q{row['id']}: {row['question']}")
        print(f"  Score: {row['factual_accuracy']}/5")

# Find questions with bias concerns (neutrality of 3 or lower)
low_neutrality = scored_df[scored_df['neutrality'] <= 3]
if not low_neutrality.empty:
    print(f"\n‚ö†Ô∏è  POTENTIAL BIAS CONCERNS ({len(low_neutrality)}):")
    for _, row in low_neutrality.iterrows():
        print(f"‚Ä¢ Q{row['id']}: {row['question']}")
        print(f"  Neutrality Score: {row['neutrality']}/5")

# Response length analysis (lengths are word counts)
print(f"\nüìè RESPONSE LENGTH ANALYSIS:")
print(f"‚Ä¢ Average GPT Response Length: {scored_df['gpt_response_length'].mean():.1f} words")
print(f"‚Ä¢ Average Ground Truth Length: {scored_df['ground_truth_length'].mean():.1f} words")
print(f"‚Ä¢ Length Correlation with Quality: {scored_df['gpt_response_length'].corr(scored_df['overall_score']):.3f}")

# Category-specific insights
print(f"\nüè• CATEGORY-SPECIFIC INSIGHTS:")
for category in scored_df['category'].unique():
    cat_data = scored_df[scored_df['category'] == category]
    print(f"\n{category} ({len(cat_data)} questions):")
    print(f"  ‚Ä¢ Average Score: {cat_data['overall_score'].mean():.2f}")
    print(f"  ‚Ä¢ Factual Accuracy: {cat_data['factual_accuracy'].mean():.2f}")
    print(f"  ‚Ä¢ Neutrality: {cat_data['neutrality'].mean():.2f}")

    # Find the worst performing question in this category. idxmin() has no
    # valid label when every overall score is NaN, so that case is reported
    # explicitly instead of crashing in .loc.
    category_overall = cat_data['overall_score'].dropna()
    if category_overall.empty:
        print(f"  ‚Ä¢ Worst Question: n/a (no parsed scores in this category)")
        continue
    worst = cat_data.loc[category_overall.idxmin()]
    print(f"  ‚Ä¢ Worst Question: {worst['question'][:50]}... (Score: {worst['overall_score']:.2f})")


In [None]:
## 📝 Export Results for Further Analysis


In [None]:
# Export analysis results to CSV
# Criterion columns and their display labels are defined here so this cell
# does not depend on a variable created by the plotting cell.
export_criteria = ['factual_accuracy', 'clarity', 'neutrality', 'helpfulness']
export_labels = ['Factual Accuracy', 'Clarity', 'Neutrality', 'Helpfulness']

# Aggregates used by both the summary file and the printed conclusions
overall_mean = df['overall_score'].mean()
criterion_means = [df[criterion].mean() for criterion in export_criteria]
category_means = df.groupby('category')['overall_score'].mean()
low_accuracy_count = len(df[df['factual_accuracy'] <= 2])
low_neutrality_count = len(df[df['neutrality'] <= 3])
mean_response_length = df['gpt_response_length'].mean()

df_export = df[['id', 'category', *export_criteria, 'overall_score', 'gpt_response_length']].copy()

# Save to scores.csv
df_export.to_csv('../data/scores.csv', index=False)

# Create a summary report
# The report contains non-ASCII characters, so it is written as UTF-8
# explicitly rather than with the platform's default encoding. The results
# directory is created first because open() does not create it.
summary_path = Path('../results/analysis_summary.txt')
summary_path.parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("LLM HEALTH EVALUATION - ANALYSIS SUMMARY\n")
    f.write("="*50 + "\n\n")

    f.write(f"Dataset: {len(df)} health questions evaluated\n")
    f.write(f"Average Overall Score: {overall_mean:.2f}/5.0\n\n")

    f.write("SCORES BY CRITERIA:\n")
    for criterion, criterion_mean in zip(export_criteria, criterion_means):
        f.write(f"‚Ä¢ {criterion.replace('_', ' ').title()}: {criterion_mean:.2f}/5.0\n")

    f.write(f"\nSCORES BY CATEGORY:\n")
    for category, score in category_means.items():
        f.write(f"‚Ä¢ {category}: {score:.2f}/5.0\n")

    f.write(f"\nKEY FINDINGS:\n")
    f.write(f"‚Ä¢ Questions with concerning factual accuracy (‚â§2): {low_accuracy_count}\n")
    f.write(f"‚Ä¢ Questions with potential bias concerns (neutrality ‚â§3): {low_neutrality_count}\n")
    f.write(f"‚Ä¢ Average response length: {mean_response_length:.1f} words\n")

print("‚úÖ Results exported:")
print("‚Ä¢ scores.csv - Detailed scores for each question")
print("‚Ä¢ analysis_summary.txt - Key findings summary")
print("‚Ä¢ overall_performance.png - Main visualization")

print(f"\nüéØ MAIN CONCLUSIONS:")
print(f"‚Ä¢ GPT-4 achieved an average score of {overall_mean:.2f}/5.0 across all health questions")
print(f"‚Ä¢ Strongest area: {export_labels[np.argmax(criterion_means)]}")
print(f"‚Ä¢ Area for improvement: {export_labels[np.argmin(criterion_means)]}")
print(f"‚Ä¢ {low_accuracy_count} questions had concerning factual accuracy scores")
print(f"‚Ä¢ The model performed best on {category_means.idxmax()} questions")
