# In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, gmean
import os
import warnings

class OrderingImpactAnalyzer:
    """
    Analyze the impact of criteria presentation order on LLM performance
    in multi-criteria decision making tasks.

    Performance is measured as the Pearson correlation between a model's
    criterion weights and the expert benchmark weights stored in
    ``self.expert_weights``. Two conditions are compared: a "random order"
    condition (per-iteration weights aggregated per model) and an
    "expert-aligned" condition (one weight per criterion per model).
    """

    # Columns of the random-order table that carry metadata, not weights.
    _METADATA_COLUMNS = ('Prompting_Approach', 'Model', 'Iteration')

    # Placeholder written to '% Change' when the baseline correlation is zero.
    _UNDEFINED_CHANGE = "N/A"

    def __init__(self):
        # Expert benchmark weights (can be modified for different applications)
        self.expert_weights = {
            'Solar Radiation': 0.235,
            'Aspect': 0.146,
            'Aspect (Slope Direction)': 0.146,  # Keep both for compatibility
            'Land Use': 0.144,
            'Slope': 0.075,
            'Distance to Power Transmission Lines': 0.064,
            'Distance to Settlements': 0.064,
            'Distance to Transformers': 0.058,
            'Altitude': 0.048,
            'Distance to Protected Areas': 0.040,
            'Distance to Highways': 0.035,
            'Distance from Fault Lines': 0.029,
            'Distance to Airports': 0.022,
            'Distance to Rivers and Lakes': 0.022,
            'Distance to Bird Migration Routes': 0.018
        }

    def safe_geometric_mean(self, values):
        """
        Calculate geometric mean safely, handling zeros and negative values.

        Parameters:
        values (array-like): Numeric values to aggregate

        Returns:
        float: The geometric mean, or the arithmetic mean (with a warning)
               when any value is <= 0 or the geometric mean cannot be computed
        """
        values = np.array(values)

        # The geometric mean is only defined for strictly positive values.
        if np.any(values <= 0):
            warnings.warn("Found zero or negative values. Using arithmetic mean instead.")
            return np.mean(values)

        try:
            return gmean(values)
        except Exception as e:
            warnings.warn(f"Error calculating geometric mean: {e}. Using arithmetic mean.")
            return np.mean(values)

    def calculate_correlation_with_benchmark(self, model_weights, criteria_names):
        """
        Calculate Pearson correlation with benchmark weights.

        Parameters:
        model_weights (dict): Mapping of criterion name to the model's weight
        criteria_names (iterable): Criteria to compare, in the desired order

        Returns:
        float or None: Pearson r over the criteria present in both the
                       benchmark and ``model_weights``; None when fewer than
                       two criteria can be paired, when the computation fails,
                       or when the correlation is undefined (NaN)
        """
        benchmark_weights_ordered = []
        model_weights_ordered = []

        for criterion in criteria_names:
            if criterion not in self.expert_weights:
                print(f"Warning: Criterion '{criterion}' not found in benchmark weights")
                continue
            # Skip instead of raising KeyError so one missing criterion does
            # not abort the whole analysis.
            if criterion not in model_weights:
                print(f"Warning: Criterion '{criterion}' not found in model weights")
                continue
            benchmark_weights_ordered.append(self.expert_weights[criterion])
            model_weights_ordered.append(model_weights[criterion])

        # Both lists are filled in lockstep, so only the pair count matters.
        if len(benchmark_weights_ordered) < 2:
            print("Warning: Fewer than two criteria could be paired with the benchmark")
            return None

        try:
            pearson_r, _ = pearsonr(benchmark_weights_ordered, model_weights_ordered)
        except Exception as e:
            print(f"Error calculating correlation: {e}")
            return None

        # Constant or NaN weights give an undefined correlation; report it as
        # "no result" rather than letting NaN flow into the results table.
        if not np.isfinite(pearson_r):
            print("Warning: Correlation is undefined (constant or missing weights)")
            return None

        return pearson_r

    def get_random_order_performance(self, consolidated_df, baseline_approach="minimal"):
        """
        Get random order performance using geometric mean aggregation.

        Parameters:
        consolidated_df (pd.DataFrame): Table with 'Prompting_Approach' and
            'Model' columns, an optional 'Iteration' column, and one column
            per criterion
        baseline_approach (str): Case-insensitive substring identifying the
            prompting approach to use; underscores and spaces are treated
            alike. If nothing matches, the first approach is used.

        Returns:
        dict: Mapping of model name to Pearson r against the benchmark
              (models without a computable correlation are omitted)
        """
        approaches = consolidated_df['Prompting_Approach'].dropna().unique()
        if len(approaches) == 0:
            print("Warning: No prompting approaches found in random order data")
            return {}

        # Normalize both sides identically; otherwise a baseline containing
        # an underscore could never match the normalized approach name.
        target = baseline_approach.lower().replace('_', ' ')

        # Find baseline approach
        baseline_data = None
        for approach in approaches:
            approach_lower = str(approach).lower().replace('_', ' ')
            if target in approach_lower:
                baseline_data = consolidated_df[consolidated_df['Prompting_Approach'] == approach]
                print(f"Found random order approach: {approach}")
                break

        if baseline_data is None:
            print("Warning: Using first available approach as random order")
            first_approach = approaches[0]
            baseline_data = consolidated_df[consolidated_df['Prompting_Approach'] == first_approach]
            print(f"Using approach: {first_approach}")

        # Get criteria columns
        criteria_columns = [col for col in baseline_data.columns
                            if col not in self._METADATA_COLUMNS]

        # Calculate geometric mean weights for each model
        random_order_results = {}
        for model in baseline_data['Model'].unique():
            model_data = baseline_data[baseline_data['Model'] == model]

            geometric_mean_weights = {
                col: self.safe_geometric_mean(model_data[col].values)
                for col in criteria_columns
            }

            correlation = self.calculate_correlation_with_benchmark(
                geometric_mean_weights, criteria_columns
            )
            if correlation is not None:
                random_order_results[model] = correlation

        return random_order_results

    def get_expert_aligned_performance(self, expert_aligned_df):
        """
        Get performance for expert-aligned ordering condition.

        Parameters:
        expert_aligned_df (pd.DataFrame): Table with a 'Criterion' column and
            one weight column per model

        Returns:
        dict: Mapping of cleaned model name ('-' and '_' removed from the
              column name) to Pearson r against the benchmark
        """
        expert_aligned_results = {}
        criteria_names = expert_aligned_df['Criterion'].tolist()
        model_columns = [col for col in expert_aligned_df.columns if col != 'Criterion']

        for model_col in model_columns:
            # Clean model name
            model_name = model_col.replace('-', '').replace('_', '')

            # Create weight dictionary (a repeated criterion keeps its last weight)
            model_weights = dict(zip(criteria_names, expert_aligned_df[model_col].tolist()))

            # Calculate correlation
            correlation = self.calculate_correlation_with_benchmark(
                model_weights, criteria_names
            )

            if correlation is not None:
                expert_aligned_results[model_name] = correlation

        return expert_aligned_results

    def normalize_model_name(self, name):
        """Normalize model names for comparison (lowercase, no '-', '_', '.', or spaces)"""
        return name.lower().replace('-', '').replace('_', '').replace('.', '').replace(' ', '')

    def match_models(self, random_order_results, expert_aligned_results, min_similarity=0.8):
        """
        Match models between random order and expert-aligned conditions.

        Matching is one-to-one: a random-order model is paired with at most
        one expert-aligned model.

        Parameters:
        random_order_results (dict): Results keyed by random-order model name
        expert_aligned_results (dict): Results keyed by expert-aligned model name
        min_similarity (float): Length ratio a substring match must exceed to
            be accepted in the flexible pass

        Returns:
        list: (expert_model, random_model) tuples
        """
        random_order_models = list(random_order_results.keys())
        expert_aligned_models = list(expert_aligned_results.keys())

        # Normalize every name once instead of inside the nested loops.
        normalized_random = {m: self.normalize_model_name(m) for m in random_order_models}
        normalized_expert = {m: self.normalize_model_name(m) for m in expert_aligned_models}

        matched_models = []
        matched_random = set()
        matched_expert = set()

        # Exact matches first
        for expert_model in expert_aligned_models:
            expert_normalized = normalized_expert[expert_model]

            for random_model in random_order_models:
                # Never hand the same random-order model to two expert models.
                if random_model in matched_random:
                    continue

                if normalized_random[random_model] == expert_normalized:
                    matched_models.append((expert_model, random_model))
                    matched_random.add(random_model)
                    matched_expert.add(expert_model)
                    break

        # Flexible matching for unmatched models
        unmatched_expert = [m for m in expert_aligned_models if m not in matched_expert]
        unmatched_random = [m for m in random_order_models if m not in matched_random]

        for expert_model in unmatched_expert:
            expert_normalized = normalized_expert[expert_model]
            best_match = None
            best_score = 0

            for random_model in unmatched_random:
                random_normalized = normalized_random[random_model]

                # An empty name is a substring of everything; ignore it.
                if not expert_normalized or not random_normalized:
                    continue

                # Similarity = length of the contained name / length of the container
                if expert_normalized in random_normalized:
                    score = len(expert_normalized) / len(random_normalized)
                elif random_normalized in expert_normalized:
                    score = len(random_normalized) / len(expert_normalized)
                else:
                    continue

                if score > best_score and score > min_similarity:
                    best_score = score
                    best_match = random_model

            if best_match:
                matched_models.append((expert_model, best_match))
                matched_random.add(best_match)
                matched_expert.add(expert_model)
                unmatched_random.remove(best_match)

        return matched_models

    def create_display_name(self, model_name):
        """
        Convert model name to display format.

        An exact key match is tried first. Otherwise the name is compared
        after normalization, so variants such as 'Claude3.7' or 'gpt-4.1'
        still resolve. Unknown names are returned unchanged.
        """
        name_mapping = {
            'Claude37': 'Claude-3.7',
            'Claude37Thinking': 'Claude-3.7-Thinking',
            'DeepSeekR1': 'DeepSeek-R1',
            'GPT41': 'GPT-4.1',
            'Gemini25Pro': 'Gemini-2.5-Pro',
            'o3': 'o3'
        }
        if model_name in name_mapping:
            return name_mapping[model_name]

        # Names cleaned by get_expert_aligned_performance may still contain
        # dots, so fall back to a normalized comparison.
        normalized = self.normalize_model_name(model_name)
        for key, display in name_mapping.items():
            if self.normalize_model_name(key) == normalized:
                return display

        return model_name

    @classmethod
    def _format_percent_change(cls, delta, baseline):
        """Format delta relative to |baseline| as a signed percent string."""
        if baseline == 0:
            return cls._UNDEFINED_CHANGE
        # Dividing by the magnitude keeps the sign of the percentage equal to
        # the sign of delta even when the baseline correlation is negative.
        return f"{(delta / abs(baseline)) * 100:+.1f}%"

    @staticmethod
    def _parse_percent(text):
        """Parse a '% Change' cell to float; return None if it is not a finite number."""
        try:
            value = float(str(text).replace('%', '').replace('+', ''))
        except ValueError:
            return None
        return value if np.isfinite(value) else None

    def analyze_ordering_impact(self, random_order_weights_path, expert_aligned_weights_path,
                                baseline_approach="minimal"):
        """
        Main analysis function to compare random order vs expert-aligned ordering performance.

        Parameters:
        random_order_weights_path (str): Path to random order condition weights
        expert_aligned_weights_path (str): Path to expert-aligned condition weights
        baseline_approach (str): Prompting approach used as the random order
            condition (forwarded to get_random_order_performance)

        Returns:
        pd.DataFrame: Results comparing performance between conditions, one
                      row per matched model plus an 'Overall Average' row;
                      None if an input file is missing or cannot be parsed
        """

        try:
            random_order_df = pd.read_csv(random_order_weights_path)
            expert_aligned_df = pd.read_csv(expert_aligned_weights_path)
        except FileNotFoundError as e:
            print(f"Error: File not found - {e}")
            return None
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
            print(f"Error: Could not parse input file - {e}")
            return None

        print("Calculating random order performance...")
        random_order_results = self.get_random_order_performance(
            random_order_df, baseline_approach
        )

        print("Calculating expert-aligned performance...")
        expert_aligned_results = self.get_expert_aligned_performance(expert_aligned_df)

        print("Matching models between conditions...")
        matched_models = self.match_models(random_order_results, expert_aligned_results)

        # Create results table; keep unrounded values for the overall average
        results_data = []
        random_values = []
        expert_values = []

        for expert_model, random_model in matched_models:
            random_order_r = random_order_results[random_model]
            expert_aligned_r = expert_aligned_results[expert_model]
            delta_r = expert_aligned_r - random_order_r

            random_values.append(random_order_r)
            expert_values.append(expert_aligned_r)

            results_data.append({
                'Model': self.create_display_name(expert_model),
                'Random Order': round(random_order_r, 3),
                'Expert-Aligned': round(expert_aligned_r, 3),
                'Δ Pearson r': f"{delta_r:+.3f}",
                '% Change': self._format_percent_change(delta_r, random_order_r)
            })

        # Calculate overall average
        if results_data:
            avg_random = np.mean(random_values)
            avg_expert = np.mean(expert_values)
            avg_delta = avg_expert - avg_random

            results_data.append({
                'Model': 'Overall Average',
                'Random Order': round(avg_random, 3),
                'Expert-Aligned': round(avg_expert, 3),
                'Δ Pearson r': f"{avg_delta:+.3f}",
                '% Change': self._format_percent_change(avg_delta, avg_random)
            })

        return pd.DataFrame(results_data)

    def display_results(self, results_df, title="Ordering Impact Analysis"):
        """Display results in formatted table (prints a notice when there is nothing to show)"""

        if results_df is None or results_df.empty:
            print("No results to display")
            return

        print("\n" + "="*80)
        print(title)
        print("="*80)
        print(results_df.to_string(index=False))
        print("="*80)

    def save_results(self, results_df, output_dir="output", filename="ordering_impact_analysis.csv"):
        """Save results to CSV file, creating ``output_dir`` if it does not exist"""

        if results_df is None or results_df.empty:
            print("No results to save")
            return

        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, filename)
        results_df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

    def get_summary_statistics(self, results_df):
        """
        Generate summary statistics from results.

        Parameters:
        results_df (pd.DataFrame): Table with 'Model' and '% Change' columns

        Returns:
        dict or None: Counts and extremes of the per-model percent changes
                      plus the 'Overall Average' change string ("N/A" when
                      that row is absent); None when there are no individual
                      model rows. '% Change' cells that are not finite
                      numbers are ignored in the counts and extremes.
        """

        if results_df is None or results_df.empty:
            return None

        is_overall = results_df['Model'] == 'Overall Average'
        individual_models = results_df[~is_overall]
        overall_rows = results_df[is_overall]

        if individual_models.empty:
            return None

        # Extract numeric values, skipping cells that cannot be parsed
        parsed = (self._parse_percent(cell) for cell in individual_models['% Change'])
        percent_changes = [value for value in parsed if value is not None]

        summary = {
            'models_analyzed': len(individual_models),
            'positive_impacts': len([p for p in percent_changes if p > 0]),
            'negative_impacts': len([p for p in percent_changes if p < 0]),
            'max_improvement': max(percent_changes) if percent_changes else 0,
            'max_degradation': min(percent_changes) if percent_changes else 0,
            'overall_change': overall_rows['% Change'].iloc[0]
                              if not overall_rows.empty else "N/A"
        }

        return summary


def main():
    """
    Main execution function.

    Runs the ordering impact analysis on the two CSV files named below,
    prints the results table, saves it through the analyzer's save_results
    defaults, and prints a short summary. Prints a failure notice instead
    when the analysis returns None.
    """

    # Initialize analyzer
    analyzer = OrderingImpactAnalyzer()

    # File paths - modify these for your data
    random_order_weights_path = "individual_weights_iterations.csv"
    expert_aligned_weights_path = "expert_aligned_order_weights.csv"

    print("Starting ordering impact analysis...")

    # Perform analysis
    results = analyzer.analyze_ordering_impact(
        random_order_weights_path,
        expert_aligned_weights_path
    )

    if results is None:
        print("Analysis failed. Please check your input files.")
        return

    # Display results
    analyzer.display_results(results, "Criteria Ordering Impact on LLM Performance")

    # Save results
    analyzer.save_results(results)

    # Print summary
    summary = analyzer.get_summary_statistics(results)
    if summary:
        print("\nSummary:")
        print(f"- Models analyzed: {summary['models_analyzed']}")
        print(f"- Models with improvement: {summary['positive_impacts']}")
        print(f"- Models with degradation: {summary['negative_impacts']}")
        # Signed format spec: a hard-coded '+' prefix rendered "+-3.0%" when
        # even the best model got worse.
        print(f"- Maximum improvement: {summary['max_improvement']:+.1f}%")
        print(f"- Maximum degradation: {summary['max_degradation']:+.1f}%")
        print(f"- Overall average change: {summary['overall_change']}")

    print("\nAnalysis completed successfully!")


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()