In [2]:
import json
import os
from collections import defaultdict, Counter

# Configuration
ANNOTATION_DIR = r"C:\Users\SouayedBelkiss\OneDrive - gae\Desktop\Thesis\annotation_analysis\expert work"
ANNOTATION_FILES = {
    'Expert_A': 'alexa_annotations.json',
    'Expert_K': 'katrin_annotations.json', 
    'Expert_L': 'luisa_annotations.json',
    'Expert_M': 'martin_annotations.json'
}

def load_annotations(expert_name, filename):
    """Load annotations from JSON file"""
    filepath = os.path.join(ANNOTATION_DIR, filename)
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading {expert_name} annotations: {e}")
        return []

def extract_field_data(annotations, field_name):
    """Extract all values for a specific field across all annotations"""
    field_values = []
    
    for record in annotations:
        if 'annotations' in record and len(record['annotations']) > 0:
            annotation = record['annotations'][0]
            if 'result' in annotation:
                for item in annotation['result']:
                    if item.get('from_name') == field_name:
                        field_values.append({
                            'image': record['data'].get('original_filename', 'unknown'),
                            'value': item.get('value', {}),
                            'type': item.get('type', 'unknown')
                        })
    
    return field_values

def analyze_field_structure():
    """Analyze the structure of concern and yes/no fields across all experts"""
    
    print("=" * 80)
    print("ANALYZING CONCERN AND YES/NO FIELD STRUCTURES")
    print("=" * 80)
    
    # Fields to analyze
    target_fields = ['concerns', 'no_text', 'well_separated_obj']
    
    all_experts_data = {}
    
    # Load data for each expert
    for expert_name, filename in ANNOTATION_FILES.items():
        print(f"\n📁 Loading {expert_name} annotations...")
        annotations = load_annotations(expert_name, filename)
        print(f"   Loaded {len(annotations)} records")
        
        all_experts_data[expert_name] = {}
        
        # Extract data for each target field
        for field in target_fields:
            field_data = extract_field_data(annotations, field)
            all_experts_data[expert_name][field] = field_data
            print(f"   {field}: {len(field_data)} entries found")
    
    # Detailed analysis for each field
    for field in target_fields:
        print(f"\n" + "="*60)
        print(f"DETAILED ANALYSIS: {field.upper()}")
        print("="*60)
        
        for expert_name in ANNOTATION_FILES.keys():
            field_data = all_experts_data[expert_name][field]
            print(f"\n--- {expert_name} ---")
            print(f"Total entries: {len(field_data)}")
            
            if len(field_data) > 0:
                # Analyze value structure
                value_types = Counter()
                sample_values = []
                
                for entry in field_data[:10]:  # First 10 samples
                    value = entry['value']
                    value_types[str(type(value))] += 1
                    sample_values.append(value)
                
                print(f"Value types: {dict(value_types)}")
                print("Sample values:")
                for i, val in enumerate(sample_values[:5]):
                    print(f"  {i+1}: {val}")
                
                # For concerns field, analyze choice patterns
                if field == 'concerns':
                    all_concerns = []
                    empty_count = 0
                    
                    for entry in field_data:
                        choices = entry['value'].get('choices', [])
                        if choices:
                            all_concerns.extend(choices)
                        else:
                            empty_count += 1
                    
                    print(f"Empty concern responses: {empty_count}")
                    print(f"Total concern selections: {len(all_concerns)}")
                    
                    if all_concerns:
                        concern_counts = Counter(all_concerns)
                        print("Top concerns:")
                        for concern, count in concern_counts.most_common(10):
                            print(f"  '{concern}': {count}")
                
                # For yes/no fields, analyze response patterns
                elif field in ['no_text', 'well_separated_obj']:
                    response_counts = Counter()
                    empty_count = 0
                    
                    for entry in field_data:
                        choices = entry['value'].get('choices', [])
                        if choices:
                            # Assume single choice for yes/no questions
                            response_counts[choices[0]] += 1
                        else:
                            empty_count += 1
                    
                    print(f"Empty responses: {empty_count}")
                    print("Response distribution:")
                    for response, count in response_counts.items():
                        percentage = (count / len(field_data)) * 100
                        print(f"  '{response}': {count} ({percentage:.1f}%)")
            
            else:
                print("No data found for this field")
    
    # Cross-expert comparison
    print(f"\n" + "="*60)
    print("CROSS-EXPERT COMPARISON")
    print("="*60)
    
    for field in target_fields:
        print(f"\n{field.upper()} - Coverage by Expert:")
        coverage_data = []
        
        for expert_name in ANNOTATION_FILES.keys():
            field_data = all_experts_data[expert_name][field]
            total_annotations = len(load_annotations(expert_name, ANNOTATION_FILES[expert_name]))
            coverage = len(field_data) / total_annotations * 100 if total_annotations > 0 else 0
            coverage_data.append({
                'Expert': expert_name,
                'Field_Entries': len(field_data),
                'Total_Annotations': total_annotations,
                'Coverage_Percent': coverage
            })
            print(f"  {expert_name}: {len(field_data)}/{total_annotations} ({coverage:.1f}%)")
        
        # Calculate summary statistics manually
        coverages = [item['Coverage_Percent'] for item in coverage_data]
        avg_coverage = sum(coverages) / len(coverages) if coverages else 0
        min_coverage = min(coverages) if coverages else 0
        max_coverage = max(coverages) if coverages else 0
        
        print(f"\nSummary statistics for {field}:")
        print(f"  Average coverage: {avg_coverage:.1f}%")
        print(f"  Coverage range: {min_coverage:.1f}% - {max_coverage:.1f}%")

# Run the analysis
if __name__ == "__main__":
    analyze_field_structure()

ANALYZING CONCERN AND YES/NO FIELD STRUCTURES

📁 Loading Expert_A annotations...
   Loaded 250 records
   concerns: 240 entries found
   no_text: 250 entries found
   well_separated_obj: 250 entries found

📁 Loading Expert_K annotations...
   Loaded 250 records
   concerns: 241 entries found
   no_text: 250 entries found
   well_separated_obj: 250 entries found

📁 Loading Expert_L annotations...
   Loaded 200 records
   concerns: 139 entries found
   no_text: 200 entries found
   well_separated_obj: 200 entries found

📁 Loading Expert_M annotations...
   Loaded 276 records
   concerns: 49 entries found
   no_text: 276 entries found
   well_separated_obj: 276 entries found

DETAILED ANALYSIS: CONCERNS

--- Expert_A ---
Total entries: 240
Value types: {"<class 'dict'>": 10}
Sample values:
  1: {'choices': ['Too complex for target audience', 'Misleading representation', 'Poor accessibility']}
  2: {'choices': ['Too complex for target audience', 'Misleading representation', 'Poor accessibi

In [7]:
def main():
    """Main analysis function"""
    print("🚀 Starting Concern Pattern and Yes/No Analysis")
    print("="*80)
    
    # Load all data
    all_data = {}
    for expert_name, filename in ANNOTATION_FILES.items():
        print(f"Loading {expert_name} data...")
        annotations = load_annotations(expert_name, filename)
        all_data[expert_name] = extract_structured_data(annotations, expert_name)
        print(f"  Extracted data for {len(all_data[expert_name])} images")
    
    # Run analyses
    overall_concern_counts, expert_concern_counts, concern_pairs = analyze_concern_patterns(all_data)
    
    # Try to load mapping file for style analysis
    mapping_df = None
    try:
        # You can uncomment and adjust this path when you want style analysis
        # mapping_df = pd.read_csv('path_to_your_mapping_file.csv')
        # print("✅ Mapping file loaded for style analysis")
        pass
    except:
        print("⚠️  No mapping file found - skipping style-specific analysis")
    
    analyze_yesno_patterns(all_data, mapping_df)
    
    print("\n🎉 Analysis complete!")
    print("="*80)
import os
from collections import defaultdict, Counter
import numpy as np
from itertools import combinations

# Configuration
ANNOTATION_DIR = r"C:\Users\SouayedBelkiss\OneDrive - gae\Desktop\Thesis\annotation_analysis\expert work"
ANNOTATION_FILES = {
    'Expert_A': 'alexa_annotations.json',
    'Expert_K': 'katrin_annotations.json', 
    'Expert_L': 'luisa_annotations.json',
    'Expert_M': 'martin_annotations.json'
}

# Define all possible concerns based on your data
ALL_CONCERNS = [
    'Too complex for target audience',
    'Misleading representation', 
    'Poor accessibility',
    'Discriminatory content',
    'Culturally insensitive',
    'Potentially triggering'
]

def load_annotations(expert_name, filename):
    """Load annotations from JSON file"""
    filepath = os.path.join(ANNOTATION_DIR, filename)
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading {expert_name} annotations: {e}")
        return []

def extract_structured_data(annotations, expert_name):
    """Extract structured data for analysis"""
    data = []
    
    for record in annotations:
        if 'annotations' in record and len(record['annotations']) > 0:
            annotation = record['annotations'][0]
            if 'result' in annotation:
                
                # Initialize record data
                record_data = {
                    'expert': expert_name,
                    'image': record['data'].get('original_filename', 'unknown'),
                    'concerns': [],
                    'no_text': None,
                    'well_separated_obj': None
                }
                
                # Extract data from result items
                for item in annotation['result']:
                    field_name = item.get('from_name')
                    value = item.get('value', {})
                    
                    if field_name == 'concerns':
                        record_data['concerns'] = value.get('choices', [])
                    elif field_name == 'no_text':
                        choices = value.get('choices', [])
                        record_data['no_text'] = choices[0] if choices else None
                    elif field_name == 'well_separated_obj':
                        choices = value.get('choices', [])
                        record_data['well_separated_obj'] = choices[0] if choices else None
                
                data.append(record_data)
    
    return data

def analyze_concern_patterns(all_data):
    """Analyze concern patterns across experts"""
    print("\n" + "="*80)
    print("🚨 CONCERN PATTERN ANALYSIS")
    print("="*80)
    
    # 1. Most common concerns overall
    print("\n📊 MOST COMMON CONCERNS (Overall)")
    print("-" * 50)
    
    all_concerns = []
    expert_concern_counts = defaultdict(Counter)
    
    for expert_data in all_data.values():
        for record in expert_data:
            concerns = record['concerns']
            all_concerns.extend(concerns)
            for concern in concerns:
                expert_concern_counts[record['expert']][concern] += 1
    
    overall_concern_counts = Counter(all_concerns)
    
    for concern, count in overall_concern_counts.most_common():
        percentage = (count / len(all_concerns)) * 100 if all_concerns else 0
        print(f"{concern:35}: {count:3d} ({percentage:5.1f}%)")
    
    # 1.5. Most picked concern by each expert
    print(f"\n🏆 MOST PICKED CONCERN BY EACH EXPERT")
    print("-" * 50)
    
    for expert_name in ['Expert_A', 'Expert_K', 'Expert_L', 'Expert_M']:
        expert_counts = expert_concern_counts[expert_name]
        if expert_counts:
            top_concern, top_count = expert_counts.most_common(1)[0]
            total_expert_concerns = sum(expert_counts.values())
            percentage = (top_count / total_expert_concerns) * 100
            print(f"{expert_name}: '{top_concern}' ({top_count}/{total_expert_concerns} = {percentage:.1f}%)")
        else:
            print(f"{expert_name}: No concerns reported")
    
    # 1.6. Expert specialty/focus analysis
    print(f"\n🎯 EXPERT CONCERN SPECIALTIES")
    print("-" * 50)
    
    # Calculate each expert's concern distribution
    for expert_name in ['Expert_A', 'Expert_K', 'Expert_L', 'Expert_M']:
        expert_counts = expert_concern_counts[expert_name]
        total_expert = sum(expert_counts.values())
        
        if total_expert > 0:
            print(f"\n{expert_name} concern breakdown:")
            for concern, count in expert_counts.most_common():
                pct = (count / total_expert) * 100
                print(f"  {concern:<35}: {count:3d} ({pct:5.1f}%)")
        else:
            print(f"\n{expert_name}: No concerns reported")
    
    # 2. Concern patterns by expert
    print(f"\n📋 CONCERN FREQUENCY BY EXPERT")
    print("-" * 50)
    print(f"{'Concern':<35} {'A':>6} {'K':>6} {'L':>6} {'M':>6} {'Total':>7}")
    print("-" * 72)
    
    for concern in ALL_CONCERNS:
        counts = [expert_concern_counts[expert][concern] for expert in ['Expert_A', 'Expert_K', 'Expert_L', 'Expert_M']]
        total = sum(counts)
        print(f"{concern:<35} {counts[0]:6d} {counts[1]:6d} {counts[2]:6d} {counts[3]:6d} {total:7d}")
    
    # 3. Concern co-occurrence analysis
    print(f"\n🔗 CONCERN CO-OCCURRENCE ANALYSIS")
    print("-" * 50)
    
    # Find images with multiple concerns
    multi_concern_records = []
    for expert_data in all_data.values():
        for record in expert_data:
            if len(record['concerns']) > 1:
                multi_concern_records.append(record)
    
    print(f"Images with multiple concerns: {len(multi_concern_records)}")
    
    # Analyze pairs of co-occurring concerns
    concern_pairs = Counter()
    for record in multi_concern_records:
        concerns = record['concerns']
        for pair in combinations(sorted(concerns), 2):
            concern_pairs[pair] += 1
    
    print(f"\nTop concern combinations:")
    for pair, count in concern_pairs.most_common(10):
        print(f"  {pair[0]} + {pair[1]}: {count}")
    
    # 4. Expert agreement on concerns (for images annotated by multiple experts)
    print(f"\n🤝 EXPERT AGREEMENT ON CONCERNS")
    print("-" * 50)
    
    # Group by image
    image_concerns = defaultdict(list)
    for expert_data in all_data.values():
        for record in expert_data:
            image_concerns[record['image']].append({
                'expert': record['expert'],
                'concerns': set(record['concerns'])
            })
    
    # Only analyze images annotated by multiple experts
    multi_expert_images = {img: data for img, data in image_concerns.items() if len(data) > 1}
    
    print(f"Images annotated by multiple experts: {len(multi_expert_images)}")
    
    if multi_expert_images:
        agreement_scores = []
        for img, expert_data in multi_expert_images.items():
            if len(expert_data) >= 2:
                # Calculate pairwise agreement
                pairs = list(combinations(expert_data, 2))
                for pair in pairs:
                    concerns1 = pair[0]['concerns']
                    concerns2 = pair[1]['concerns']
                    
                    if len(concerns1) == 0 and len(concerns2) == 0:
                        agreement = 1.0  # Both have no concerns
                    elif len(concerns1) == 0 or len(concerns2) == 0:
                        agreement = 0.0  # One has concerns, other doesn't
                    else:
                        # Jaccard similarity
                        intersection = len(concerns1.intersection(concerns2))
                        union = len(concerns1.union(concerns2))
                        agreement = intersection / union if union > 0 else 0
                    
                    agreement_scores.append(agreement)
        
        if agreement_scores:
            avg_agreement = np.mean(agreement_scores)
            print(f"Average concern agreement (Jaccard): {avg_agreement:.3f}")
            print(f"Agreement range: {min(agreement_scores):.3f} - {max(agreement_scores):.3f}")
    
    return overall_concern_counts, expert_concern_counts, concern_pairs

def analyze_yesno_patterns(all_data, mapping_df=None):
    """Analyze Yes/No question patterns"""
    print("\n" + "="*80)
    print("✅ YES/NO QUESTION ANALYSIS")
    print("="*80)
    
    yesno_fields = ['no_text', 'well_separated_obj']
    
    for field in yesno_fields:
        print(f"\n📊 {field.upper().replace('_', ' ')} ANALYSIS")
        print("-" * 50)
        
        # Overall response distribution
        field_responses = defaultdict(Counter)
        all_responses = []
        
        for expert_name, expert_data in all_data.items():
            for record in expert_data:
                response = record[field]
                if response:
                    field_responses[expert_name][response] += 1
                    all_responses.append(response)
        
        # Overall distribution
        overall_counts = Counter(all_responses)
        total_responses = len(all_responses)
        
        print("Overall distribution:")
        for response, count in overall_counts.items():
            percentage = (count / total_responses) * 100
            print(f"  {response}: {count} ({percentage:.1f}%)")
        
        # By expert
        print(f"\nDistribution by expert:")
        print(f"{'Expert':<10} {'Yes':>6} {'No':>6} {'Yes %':>8}")
        print("-" * 32)
        
        for expert_name in ANNOTATION_FILES.keys():
            yes_count = field_responses[expert_name]['Yes']
            no_count = field_responses[expert_name]['No']
            total_expert = yes_count + no_count
            yes_pct = (yes_count / total_expert * 100) if total_expert > 0 else 0
            
            print(f"{expert_name:<10} {yes_count:6d} {no_count:6d} {yes_pct:7.1f}%")
        
        # Expert agreement analysis
        print(f"\n🤝 Expert Agreement for {field.replace('_', ' ').title()}")
        print("-" * 40)
        
        # Group responses by image
        image_responses = defaultdict(list)
        for expert_name, expert_data in all_data.items():
            for record in expert_data:
                if record[field]:
                    image_responses[record['image']].append({
                        'expert': expert_name,
                        'response': record[field]
                    })
        
        # Calculate agreement for images with multiple annotations
        multi_expert_images = {img: responses for img, responses in image_responses.items() if len(responses) > 1}
        
        if multi_expert_images:
            agreement_count = 0
            total_comparisons = 0
            
            for img, responses in multi_expert_images.items():
                # Check if all experts agree
                unique_responses = set(r['response'] for r in responses)
                if len(unique_responses) == 1:
                    agreement_count += 1
                total_comparisons += 1
            
            agreement_rate = (agreement_count / total_comparisons) * 100 if total_comparisons > 0 else 0
            print(f"Images with multiple annotations: {total_comparisons}")
            print(f"Perfect agreement: {agreement_count}/{total_comparisons} ({agreement_rate:.1f}%)")
        
        # Style-specific analysis (if mapping provided)
        if mapping_df is not None:
            print(f"\n🎨 {field.replace('_', ' ').title()} by Style")
            print("-" * 40)
            
            # Create image to style mapping
            style_mapping = dict(zip(mapping_df['new_filename'], mapping_df['style']))
            
            style_responses = defaultdict(Counter)
            
            for expert_name, expert_data in all_data.items():
                for record in expert_data:
                    image_file = record['image']
                    if image_file in style_mapping and record[field]:
                        style = style_mapping[image_file]
                        style_responses[style][record[field]] += 1
            
            # Display style-specific patterns
            for style in sorted(style_responses.keys()):
                yes_count = style_responses[style]['Yes']
                no_count = style_responses[style]['No']
                total_style = yes_count + no_count
                yes_pct = (yes_count / total_style * 100) if total_style > 0 else 0
                
                print(f"{style:<15}: {yes_count:3d} Yes, {no_count:3d} No ({yes_pct:5.1f}% Yes)")



def main():
    """Main analysis function"""
    print("🚀 Starting Concern Pattern and Yes/No Analysis")
    print("="*80)
    
    # Load all data
    all_data = {}
    for expert_name, filename in ANNOTATION_FILES.items():
        print(f"Loading {expert_name} data...")
        annotations = load_annotations(expert_name, filename)
        all_data[expert_name] = extract_structured_data(annotations, expert_name)
        print(f"  Extracted data for {len(all_data[expert_name])} images")
    
    # Run analyses
    overall_concern_counts, expert_concern_counts, concern_pairs = analyze_concern_patterns(all_data)
    
    # Try to load mapping file for style analysis
    mapping_df = None
    try:
        # Assuming you have a mapping file - adjust path as needed
        mapping_df = pd.read_csv('path_to_your_mapping_file.csv')
        print("✅ Mapping file loaded for style analysis")
    except:
        print("⚠️  No mapping file found - skipping style-specific analysis")
    
    analyze_yesno_patterns(all_data, mapping_df)
    
    # Create visualizations
    create_visualizations(overall_concern_counts, expert_concern_counts, all_data)
    
    print("\n🎉 Analysis complete!")
    print("="*80)

if __name__ == "__main__":
    main()

🚀 Starting Concern Pattern and Yes/No Analysis
Loading Expert_A data...
  Extracted data for 250 images
Loading Expert_K data...
  Extracted data for 250 images
Loading Expert_L data...
  Extracted data for 200 images
Loading Expert_M data...
  Extracted data for 276 images

🚨 CONCERN PATTERN ANALYSIS

📊 MOST COMMON CONCERNS (Overall)
--------------------------------------------------
Misleading representation          : 574 ( 44.7%)
Poor accessibility                 : 343 ( 26.7%)
Too complex for target audience    : 299 ( 23.3%)
Discriminatory content             :  37 (  2.9%)
Potentially triggering             :  19 (  1.5%)
Culturally insensitive             :  11 (  0.9%)

🏆 MOST PICKED CONCERN BY EACH EXPERT
--------------------------------------------------
Expert_A: 'Misleading representation' (220/564 = 39.0%)
Expert_K: 'Misleading representation' (234/450 = 52.0%)
Expert_L: 'Misleading representation' (110/216 = 50.9%)
Expert_M: 'Discriminatory content' (15/53 = 28.3%)

🎯 E

NameError: name 'create_visualizations' is not defined