In [None]:
import pandas as pd
from pathlib import Path
from typing import Dict, Tuple, Optional
import re

def calculate_metrics(df: pd.DataFrame) -> Tuple[float, float, int, int, int]:
    """
    Calculate accuracy (excluding abstains), coverage, and abstain count.

    Labels in the 'llm_label' and 'gold_label' columns are compared after
    lower-casing and stripping surrounding whitespace. A row is an abstain
    when its normalized 'llm_label' equals 'abstain'.

    Args:
        df: DataFrame with string columns 'llm_label' and 'gold_label'.
            The input frame is not modified.

    Returns:
        accuracy: float in [0, 1], correct / total_non_abstain
            (0.0 when every row is an abstain or the frame is empty)
        coverage: float percentage (0-100) of non-abstain predictions
            (0.0 when the frame is empty)
        correct: int (number of correct non-abstain predictions)
        total_non_abstain: int (total non-abstain predictions)
        abstains: int (number of abstains)

        All values are native Python numbers (not numpy scalars) so they
        match the annotated types and serialize cleanly.

    Raises:
        KeyError: if 'llm_label' or 'gold_label' is missing from df.
    """
    # Work on the two normalized Series directly; copying the whole frame
    # just to attach temporary columns is unnecessary.
    predicted = df['llm_label'].str.lower().str.strip()
    expected = df['gold_label'].str.lower().str.strip()

    abstains = int((predicted == 'abstain').sum())

    # Rows that carry an actual prediction
    answered = predicted != 'abstain'
    total_rows = len(df)
    total_non_abstain = int(answered.sum())

    if total_non_abstain > 0:
        correct = int((predicted[answered] == expected[answered]).sum())
        accuracy = correct / total_non_abstain
    else:
        correct = 0
        accuracy = 0.0

    # Coverage: percentage of non-abstain predictions
    coverage = (total_non_abstain / total_rows * 100) if total_rows > 0 else 0.0

    return float(accuracy), float(coverage), correct, total_non_abstain, abstains

def parse_configuration(filename: str) -> Dict[str, str]:
    """
    Derive the experiment configuration encoded in a result filename.

    Matching is case-insensitive and substring based.

    Returns dict with:
        - prompts: 'Few-shot', 'Zero-shot', or 'Unknown'
        - pipeline: 'Baseline', 'EM-RAG', or 'EM-KGRAG' (the fallback)
        - evidence_format: '-', 'Text', 'Python-code', or 'JSON'
        - hop_type: 'One-hop', 'Multi-hop', or 'Unknown'
        - top_k: 'k=<digits>' taken from the digits after "hop_", else 'Unknown'
        - dataset: 'small', 'mixed', or 'unknown' (from the filename prefix)
        - config_key: prompts_pipeline_evidence_hop_topk; the dataset is
          deliberately left out so small/mixed runs share a key
    """
    name = filename.lower()

    # Dataset comes from the filename prefix
    dataset = next(
        (tag for tag in ('small', 'mixed') if name.startswith(f'{tag}_')),
        'unknown',
    )

    # Prompting style: first matching token wins ('few' before 'zero')
    prompt_tokens = (('few', 'Few-shot'), ('zero', 'Zero-shot'))
    prompts = next(
        (label for token, label in prompt_tokens if token in name),
        'Unknown',
    )

    # Pipeline and evidence format are decided by the same tokens, so
    # resolve them together.
    if 'baseline' in name:
        pipeline, evidence_format = 'Baseline', '-'
    elif 'rag' in name:
        pipeline, evidence_format = 'EM-RAG', 'Text'
    else:
        pipeline = 'EM-KGRAG'
        evidence_format = 'Python-code' if 'python' in name else 'JSON'

    # Hop type: first matching token wins ('one_hop' before 'multi_hop')
    hop_tokens = (('one_hop', 'One-hop'), ('multi_hop', 'Multi-hop'))
    hop_type = next(
        (label for token, label in hop_tokens if token in name),
        'Unknown',
    )

    # Top-k is the run of digits immediately following "hop_"
    k_match = re.search(r'hop_(\d+)', name)
    top_k = f'k={k_match.group(1)}' if k_match else 'Unknown'

    return {
        'prompts': prompts,
        'pipeline': pipeline,
        'evidence_format': evidence_format,
        'hop_type': hop_type,
        'top_k': top_k,
        'dataset': dataset,
        'config_key': f"{prompts}_{pipeline}_{evidence_format}_{hop_type}_{top_k}",
    }

def process_dataset(file_path: str) -> Optional[Dict]:
    """
    Load one result CSV and bundle its metrics with its parsed configuration.

    Returns a dict holding the file path, accuracy, coverage, abstain count,
    row count, every key produced by parse_configuration for the file's
    base name, and 'success': True. Any failure (missing file, missing
    column, or any other exception) is reported with a printed warning and
    yields None instead of raising.
    """
    try:
        frame = pd.read_csv(file_path)
        # Only accuracy, coverage and the abstain count are reported here
        accuracy, coverage, _, _, abstains = calculate_metrics(frame)
        settings = parse_configuration(Path(file_path).name)
    except FileNotFoundError:
        print(f"Warning: {file_path} not found")
        return None
    except KeyError as missing:
        print(f"Warning: Required column not found in {file_path}: {missing}")
        return None
    except Exception as error:
        print(f"Warning: Error processing {file_path}: {error}")
        return None
    else:
        summary = {
            'file': file_path,
            'accuracy': accuracy,
            'coverage': coverage,
            'abstains': abstains,
            'total_rows': len(frame),
        }
        summary.update(settings)
        summary['success'] = True
        return summary

def _load_results(file_names, results_dir):
    """Process each file under results_dir and return {config_key: result}."""
    loaded = {}
    for name in file_names:
        # pathlib handles a results_dir with or without a trailing separator
        result = process_dataset(str(Path(results_dir) / name))
        if not result:
            continue
        key = result['config_key']
        if key in loaded:
            # Two files parsed to the same configuration; surface it instead
            # of silently dropping the earlier one.
            print(f"Warning: {result['file']} has the same configuration as "
                  f"{loaded[key]['file']}; keeping {result['file']}")
        loaded[key] = result
    return loaded


def _format_metric_cells(result):
    """Return the (accuracy, coverage, abstains) table cells for one result, or N/A cells."""
    if not result:
        return "N/A", "N/A", "N/A"
    return (
        f"{result['accuracy']*100:.2f}%",
        f"{result['coverage']:.2f}%",
        f"{result['abstains']}",
    )


def _print_average(label, results):
    """Print the unweighted mean accuracy/coverage and total abstains for one dataset."""
    if not results:
        return
    count = len(results)
    avg_acc = sum(r['accuracy'] for r in results.values()) / count
    avg_cov = sum(r['coverage'] for r in results.values()) / count
    total_abs = sum(r['abstains'] for r in results.values())
    print(f"{label:<40s} Accuracy: {avg_acc*100:.2f}%  |  Coverage: {avg_cov:.2f}%  |  Total Abstains: {total_abs}")


def create_comparison_table(small_files: list, mixed_files: list, results_dir: str = 'results/'):
    """
    Create a comparison table showing small and mixed datasets side-by-side.

    Each file is processed with process_dataset; files that fail to process
    are skipped (process_dataset prints the warning). Rows are matched
    between the two datasets by 'config_key' and printed in sorted key
    order, with "N/A" cells where a dataset has no result for a
    configuration. A summary of per-dataset averages follows the table.

    Args:
        small_files: file names of the small-dataset result CSVs.
        mixed_files: file names of the mixed-dataset result CSVs.
        results_dir: directory the file names are relative to
            (defaults to 'results/', the previously hard-coded location).

    Returns:
        None. All output is printed to stdout.
    """
    print("\nProcessing files...")
    small_results = _load_results(small_files, results_dir)
    mixed_results = _load_results(mixed_files, results_dir)

    # Get all unique configurations
    all_configs = sorted(set(small_results) | set(mixed_results))

    if not all_configs:
        print("No configurations found.")
        return

    # Print header
    print(f"\n{'='*180}")
    print("SIDE-BY-SIDE COMPARISON: SMALL vs MIXED DATASETS")
    print(f"{'='*180}")

    # Column headers
    print(f"\n{'Prompts':<12s} {'Pipeline':<12s} {'Hop Type':<12s} {'Top-K':<8s} {'Evidence':<15s} "
          f"{'Acc_small':<12s} {'Cov_small':<12s} {'Abs_small':<12s} "
          f"{'Acc_mixed':<12s} {'Cov_mixed':<12s} {'Abs_mixed':<12s}")
    print(f"{'-'*180}")

    # Print each configuration
    for config_key in all_configs:
        small = small_results.get(config_key)
        mixed = mixed_results.get(config_key)

        # Configuration columns come from whichever dataset has this key
        config = small or mixed
        if not config:
            continue

        acc_small, cov_small, abs_small = _format_metric_cells(small)
        acc_mixed, cov_mixed, abs_mixed = _format_metric_cells(mixed)

        print(f"{config['prompts']:<12s} {config['pipeline']:<12s} {config['hop_type']:<12s} "
              f"{config['top_k']:<8s} {config['evidence_format']:<15s} "
              f"{acc_small:<12s} {cov_small:<12s} {abs_small:<12s} "
              f"{acc_mixed:<12s} {cov_mixed:<12s} {abs_mixed:<12s}")

    # Print summary statistics
    print(f"\n{'-'*180}")
    print("SUMMARY STATISTICS")
    print(f"{'-'*180}")

    _print_average('SMALL - Average:', small_results)
    _print_average('MIXED - Average:', mixed_results)

def main():
    """
    Print the analysis banner and legend, then the small-vs-mixed comparison.

    The result file names follow the pattern
    "<dataset>_<prompt>_<variant>.csv"; the same eight variants are
    evaluated for both prompt styles on both datasets.
    """
    prompt_styles = ("few", "zero")
    variants = (
        "baseline",
        "multi_hop_1",
        "multi_hop_3_python",
        "multi_hop_3",
        "one_hop_1",
        "one_hop_3_python",
        "one_hop_3",
        "rag",
    )

    def files_for(dataset):
        # Prompt style varies slowest, variant fastest
        return [
            f"{dataset}_{prompt}_{variant}.csv"
            for prompt in prompt_styles
            for variant in variants
        ]

    mixed_var = files_for("mixed")
    small_var = files_for("small")

    rule = "=" * 180

    print("\n" + rule)
    print("ACCURACY ANALYSIS: SMALL vs MIXED DATASETS")
    print(rule)
    print("\nConfiguration Legend:")
    legend = (
        "  - Prompts: Few-shot or Zero-shot",
        "  - Pipeline: Baseline | EM-RAG | EM-KGRAG",
        "  - Hop Type: One-hop | Multi-hop",
        "  - Top-K: k=1 | k=3 (number of evidence paths retrieved)",
        "  - Evidence Format: - (baseline) | Text (RAG) | Python-code | JSON",
        "  - Accuracy: Excluding abstains",
        "  - Coverage: Percentage of non-abstain predictions",
    )
    for entry in legend:
        print(entry)

    create_comparison_table(small_var, mixed_var)

    print("\n" + rule)
    print("ANALYSIS COMPLETE")
    print(rule)


# Run the analysis only when executed as a script / notebook cell
if __name__ == "__main__":
    main()


ACCURACY ANALYSIS: SMALL vs MIXED DATASETS

Configuration Legend:
  - Prompts: Few-shot or Zero-shot
  - Pipeline: Baseline | EM-RAG | EM-KGRAG
  - Hop Type: One-hop | Multi-hop
  - Top-K: k=1 | k=3 (number of evidence paths retrieved)
  - Evidence Format: - (baseline) | Text (RAG) | Python-code | JSON
  - Accuracy: Excluding abstains
  - Coverage: Percentage of non-abstain predictions

Processing files...

SIDE-BY-SIDE COMPARISON: SMALL vs MIXED DATASETS

Prompts      Pipeline     Hop Type     Top-K    Evidence        Acc_small    Cov_small    Abs_small    Acc_mixed    Cov_mixed    Abs_mixed   
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Few-shot     Baseline     Unknown      Unknown  -               100.00%      31.37%       35           N/A          N/A          N/A         
Few-shot     EM-KGRAG     Multi-hop    k=1      JSON            88.57%   