# In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, gmean
import os
import warnings

class LLMPerformanceAnalyzer:
    """
    A tool for analyzing LLM performance in multi-criteria decision making tasks.
    Compares LLM-generated weights against expert consensus using various statistical metrics.

    Attributes:
    expert_weights (dict): Dictionary mapping criterion names to expert weights
    results (dict): Dictionary mapping group values to per-entity metric DataFrames;
        empty until calculate_performance_metrics() has been called
    """

    # Columns that describe a run rather than hold a criterion weight.
    # The grouping column is added to this set at analysis time.
    _METADATA_COLUMNS = ('Model', 'Iteration', 'Run', 'ID')

    def __init__(self, expert_weights_dict):
        """
        Initialize the analyzer with expert consensus weights.

        Parameters:
        expert_weights_dict (dict): Dictionary mapping criterion names to expert weights
        """
        self.expert_weights = expert_weights_dict
        self.results = {}

    def safe_geometric_mean(self, values):
        """
        Calculate geometric mean safely, handling zeros and negative values.

        Parameters:
        values (array-like): Values to calculate geometric mean for

        Returns:
        float: Geometric mean; the arithmetic mean if any value is zero or negative
               or if the geometric mean cannot be calculated; NaN for empty input
        """
        values = np.asarray(values)

        # Nothing to average: answer NaN directly instead of relying on the
        # library's empty-input behaviour.
        if values.size == 0:
            return np.nan

        # The geometric mean is only defined for strictly positive values
        if np.any(values <= 0):
            warnings.warn("Found zero or negative values. Using arithmetic mean instead of geometric mean.")
            return np.mean(values)

        try:
            return gmean(values)
        except Exception as e:
            # Deliberate best-effort: any failure falls back to the arithmetic mean
            warnings.warn(f"Error calculating geometric mean: {e}. Using arithmetic mean instead.")
            return np.mean(values)

    @staticmethod
    def _mean_consistency_ratio(cr_filtered, entity_col, entity):
        """Return the mean 'CR' of one entity, or None when no CR rows are available for it."""
        if cr_filtered.empty or entity_col not in cr_filtered.columns:
            return None
        if 'CR' not in cr_filtered.columns:
            return None

        entity_cr = cr_filtered.loc[cr_filtered[entity_col] == entity, 'CR']
        if entity_cr.empty:
            return None
        return entity_cr.mean()

    def _compute_entity_metrics(self, entity, entity_weights, criteria_columns):
        """Build the metrics row for one entity, or return None (after printing why) if it is skipped."""
        # Aggregate the repeated runs of each criterion into a single weight
        geometric_mean_weights = {}
        for col in criteria_columns:
            values = entity_weights[col].dropna().values
            if len(values) > 0:
                geometric_mean_weights[col] = self.safe_geometric_mean(values)

        if not geometric_mean_weights:
            print(f"Warning: No valid weights found for entity {entity}")
            return None

        # Only criteria known to both the experts and the LLM can be compared;
        # keeping the data-column order pairs the two vectors element by element.
        matched = [col for col in criteria_columns
                   if col in self.expert_weights and col in geometric_mean_weights]

        if len(matched) < 2:
            print(f"Warning: Insufficient matching criteria for entity {entity}")
            return None

        try:
            expert = np.array([self.expert_weights[col] for col in matched], dtype=float)
            llm = np.array([geometric_mean_weights[col] for col in matched], dtype=float)

            pearson_r, pearson_p = pearsonr(expert, llm)
            spearman_r, spearman_p = spearmanr(expert, llm)
            rmse = np.sqrt(np.mean((expert - llm) ** 2))

            result_row = {
                'Entity': entity,
                'Pearson_r': round(pearson_r, 3),
                'Pearson_p': pearson_p,
                'Spearman_rho': round(spearman_r, 3),
                'Spearman_p': spearman_p,
                'RMSE': round(rmse, 3),
                'N_Criteria': len(matched)
            }
        except Exception as e:
            # Deliberate best-effort: one bad entity must not abort the whole analysis
            print(f"Error calculating metrics for entity {entity}: {e}")
            return None

        print(f"  {entity}: Pearson r={pearson_r:.3f}, RMSE={rmse:.3f}")
        return result_row

    def calculate_performance_metrics(self, weights_df, cr_df, group_by='Method'):
        """
        Calculate performance metrics comparing LLM outputs to expert consensus.

        Every numeric column that is not a metadata column (the grouping column,
        'Model', 'Iteration', 'Run', 'ID') is treated as a criterion. Non-numeric
        columns are ignored because no mean can be computed for them.

        Parameters:
        weights_df (DataFrame): DataFrame containing LLM weights with columns for criteria
        cr_df (DataFrame): DataFrame containing consistency ratios; may be empty or None.
            It is only used when it contains the `group_by` column.
        group_by (str): Column name to group by (e.g., 'Method', 'Model', 'Approach')

        Returns:
        dict: Dictionary mapping group values to performance DataFrames

        Raises:
        ValueError: If `group_by` is not a column of `weights_df`
        """
        if group_by not in weights_df.columns:
            print(f"Available columns: {list(weights_df.columns)}")
            raise ValueError(f"Column '{group_by}' not found in weights DataFrame")

        groups = weights_df[group_by].unique()
        print(f"Found groups: {groups}")

        # Get criteria columns (exclude metadata and non-numeric columns)
        metadata_columns = (group_by,) + self._METADATA_COLUMNS
        criteria_columns = [col for col in weights_df.columns
                            if col not in metadata_columns
                            and pd.api.types.is_numeric_dtype(weights_df[col])]
        print(f"Found {len(criteria_columns)} criteria columns")

        # CR rows can only be matched to a group when they carry the grouping column
        cr_usable = cr_df is not None and not cr_df.empty
        if cr_usable and group_by not in cr_df.columns:
            print(f"Warning: Column '{group_by}' not found in consistency ratio data; "
                  "skipping consistency ratio analysis")
            cr_usable = False

        results = {}

        print(f"Processing {len(groups)} groups for {group_by}")

        for group_value in groups:
            print(f"\nProcessing group: {group_value}")

            # Filter data for the specific group
            weights_filtered = weights_df[weights_df[group_by] == group_value].copy()
            if cr_usable:
                cr_filtered = cr_df[cr_df[group_by] == group_value]
            else:
                cr_filtered = pd.DataFrame()

            # Reached for a NaN group label, which never compares equal to itself
            if weights_filtered.empty:
                print(f"Warning: No data found for {group_by}: {group_value}")
                continue

            if 'Model' in weights_filtered.columns:
                entity_col = 'Model'
            else:
                # If no Model column, treat each row as a separate entity
                weights_filtered['Entity'] = range(len(weights_filtered))
                entity_col = 'Entity'
            entities = weights_filtered[entity_col].unique()

            group_results = []

            for entity in entities:
                entity_weights = weights_filtered[weights_filtered[entity_col] == entity]
                mean_cr = self._mean_consistency_ratio(cr_filtered, entity_col, entity)

                result_row = self._compute_entity_metrics(entity, entity_weights, criteria_columns)
                if result_row is None:
                    continue

                if mean_cr is not None:
                    result_row['Mean_CR'] = round(mean_cr, 3)

                group_results.append(result_row)

            results[group_value] = pd.DataFrame(group_results)

        self.results = results
        return results

    def format_p_values(self, df, p_columns=('Pearson_p', 'Spearman_p')):
        """
        Format p-values for display (show <0.001 for very small p-values).

        Parameters:
        df (DataFrame): DataFrame containing p-value columns
        p_columns (list or tuple): P-value column names; names missing from `df` are skipped

        Returns:
        DataFrame: Copy of `df` with an extra '<column>_formatted' string column
                   for each p-value column found
        """
        def _format(p_value):
            return "<0.001" if p_value < 0.001 else f"{p_value:.3f}"

        df_formatted = df.copy()

        for col in p_columns:
            if col in df_formatted.columns:
                df_formatted[col + '_formatted'] = df_formatted[col].apply(_format)

        return df_formatted

    def generate_summary_statistics(self, results=None):
        """
        Generate summary statistics across all groups.

        Parameters:
        results (dict): Results dictionary (uses self.results if None)

        Returns:
        DataFrame: Summary statistics for each non-empty group. 'Best_Entity' is
                   None for a group in which every Pearson r is NaN.
        """
        if results is None:
            results = self.results

        summary_data = []

        for group_name, group_df in results.items():
            if group_df.empty:
                continue

            summary = {
                'Group': group_name,
                'N_Entities': len(group_df)
            }

            for metric in ('Pearson_r', 'Spearman_rho', 'RMSE'):
                summary[f'Mean_{metric}'] = group_df[metric].mean()
                summary[f'Std_{metric}'] = group_df[metric].std()

            pearson = group_df['Pearson_r']
            summary['Best_Pearson_r'] = pearson.max()
            # idxmax() has no usable answer when every correlation is NaN
            # (e.g. constant LLM weights), so guard it instead of crashing.
            if pearson.notna().any():
                summary['Best_Entity'] = group_df.loc[pearson.idxmax(), 'Entity']
            else:
                summary['Best_Entity'] = None

            if 'Mean_CR' in group_df.columns:
                summary['Mean_CR'] = group_df['Mean_CR'].mean()
                summary['Std_CR'] = group_df['Mean_CR'].std()

            summary_data.append(summary)

        return pd.DataFrame(summary_data)

    def save_results(self, output_dir="output", filename_prefix="llm_analysis"):
        """
        Save analysis results to CSV files.

        Writes one '<prefix>_<group>.csv' per non-empty group plus
        '<prefix>_summary.csv' into `output_dir`, creating the directory if needed.

        Parameters:
        output_dir (str): Directory to save files in
        filename_prefix (str): Prefix for output filenames

        Returns:
        DataFrame: The summary statistics that were written
        """
        os.makedirs(output_dir, exist_ok=True)

        # Save individual group results
        for group_name, group_df in self.results.items():
            if group_df.empty:
                continue

            # Clean group name for filename; str() because group labels read
            # from data may be numbers rather than strings.
            clean_name = "".join(c for c in str(group_name)
                                 if c.isalnum() or c in (' ', '-', '_')).rstrip()
            clean_name = clean_name.replace(' ', '_').lower()

            filepath = os.path.join(output_dir, f"{filename_prefix}_{clean_name}.csv")

            # Format for display
            display_df = self.format_p_values(group_df)
            display_df.to_csv(filepath, index=False)
            print(f"Saved {group_name} results to {filepath}")

        # Save summary statistics
        summary_df = self.generate_summary_statistics()
        summary_filepath = os.path.join(output_dir, f"{filename_prefix}_summary.csv")
        summary_df.to_csv(summary_filepath, index=False)
        print(f"Saved summary statistics to {summary_filepath}")

        return summary_df

    def display_results(self, max_groups=None):
        """
        Display analysis results in a formatted way.

        Parameters:
        max_groups (int): Maximum number of groups to display (None for all, 0 for none)
        """
        print("="*80)
        print("LLM PERFORMANCE ANALYSIS RESULTS")
        print("="*80)

        groups_to_show = list(self.results.keys())
        # Compare against None so that max_groups=0 really shows no groups
        if max_groups is not None:
            groups_to_show = groups_to_show[:max_groups]

        for group_name in groups_to_show:
            group_df = self.results[group_name]
            if group_df.empty:
                continue

            print(f"\nGroup: {group_name}")
            print("-" * 60)

            # Format p-values for display
            display_df = self.format_p_values(group_df)

            # Select columns for display
            display_columns = ['Entity', 'Pearson_r', 'Pearson_p_formatted',
                               'Spearman_rho', 'Spearman_p_formatted', 'RMSE']
            if 'Mean_CR' in display_df.columns:
                display_columns.append('Mean_CR')

            # Only show columns that exist
            display_columns = [col for col in display_columns if col in display_df.columns]

            print(display_df[display_columns].to_string(index=False))

        # Display summary
        print(f"\n{'='*80}")
        print("SUMMARY STATISTICS")
        print("="*80)

        summary_df = self.generate_summary_statistics()
        if not summary_df.empty:
            # Round numeric columns for display
            numeric_columns = summary_df.select_dtypes(include=[np.number]).columns
            for col in numeric_columns:
                if col != 'N_Entities':
                    summary_df[col] = summary_df[col].round(3)

            print(summary_df.to_string(index=False))

        print(f"\n{'='*80}")


def load_and_analyze(weights_file, expert_weights_dict, cr_file=None, 
                    group_by='Method', output_dir="output"):
    """
    Convenience function to load data and run analysis.

    Reads the CSV inputs, computes the per-group metrics, prints them and
    saves them under `output_dir`.

    Parameters:
    weights_file (str): Path to CSV file containing LLM weights
    expert_weights_dict (dict): Dictionary of expert consensus weights
    cr_file (str): Path to CSV file containing consistency ratios (optional).
        If given but missing on disk, a warning is printed and the analysis
        runs without consistency ratios.
    group_by (str): Column name to group analysis by
    output_dir (str): Directory to save results

    Returns:
    LLMPerformanceAnalyzer: Configured analyzer instance, or None if the
        input files could not be loaded
    """

    # Load data
    try:
        weights_df = pd.read_csv(weights_file)
        print(f"Loaded weights data: {weights_df.shape}")
        print(f"Columns in weights file: {list(weights_df.columns)}")

        # An empty frame tells the analyzer that no CR data is available
        cr_df = pd.DataFrame()
        if cr_file:
            if os.path.exists(cr_file):
                cr_df = pd.read_csv(cr_file)
                print(f"Loaded consistency ratio data: {cr_df.shape}")
                print(f"Columns in CR file: {list(cr_df.columns)}")
            else:
                # Make the skipped input visible instead of ignoring it silently
                print(f"Warning: Could not find consistency ratios file: {cr_file}")

    except Exception as e:
        # Top-level boundary: report any load failure and signal it with None
        print(f"Error loading data: {e}")
        return None

    # Initialize analyzer
    analyzer = LLMPerformanceAnalyzer(expert_weights_dict)

    # Run analysis (the results are kept on the analyzer itself)
    analyzer.calculate_performance_metrics(weights_df, cr_df, group_by)

    # Display and save results
    analyzer.display_results()
    analyzer.save_results(output_dir)

    return analyzer


# Example usage
if __name__ == "__main__":
    
    # Expert weights from research (modify these for your use case)
    expert_weights = {
        'Solar Radiation': 0.235,
        'Aspect (Slope Direction)': 0.146,
        'Land Use': 0.144,
        'Slope': 0.075,
        'Distance to Power Transmission Lines': 0.064,
        'Distance to Settlements': 0.064,
        'Distance to Transformers': 0.058,
        'Altitude': 0.048,
        'Distance to Protected Areas': 0.040,
        'Distance to Highways': 0.035,
        'Distance from Fault Lines': 0.029,
        'Distance to Airports': 0.022,
        'Distance to Rivers and Lakes': 0.022,
        'Distance to Bird Migration Routes': 0.018
    }
    
    # File paths
    consistency_ratios_path = "data/consistency_ratios.csv"
    consolidated_weights_path = "data/individual_weights_iterations.csv"
    
    print("LLM Performance Analysis Tool")
    print("="*50)
    
    # Bound up front so the final status check works on every path,
    # including the one where the weights file is missing.
    analyzer = None
    
    # Check if files exist
    if not os.path.exists(consolidated_weights_path):
        print(f"Error: Could not find weights file: {consolidated_weights_path}")
        print("Please make sure the file exists in the current directory.")
    else:
        if os.path.exists(consistency_ratios_path):
            print("Found both data files. Running full analysis...")
            cr_path = consistency_ratios_path
        else:
            print(f"Warning: Could not find consistency ratios file: {consistency_ratios_path}")
            print("Proceeding without consistency ratio analysis.")
            cr_path = None
        
        # Run the analysis, with or without the CR file
        analyzer = load_and_analyze(
            weights_file=consolidated_weights_path,
            expert_weights_dict=expert_weights,
            cr_file=cr_path,
            group_by='Prompting_Approach',  # Assuming this column exists
            output_dir="output"
        )
    
    if analyzer is not None:
        print("\nAnalysis completed successfully!")
        print("Results saved to 'output' directory.")
    else:
        print("\nAnalysis failed. Please check your data files and try again.")