In [None]:
import os
import subprocess
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
class LogicalFallaciesEvaluator:
    """Run logical-fallacy / deduction benchmarks against several models.

    The evaluator shells out to an external evaluation command once per
    (model, task) pair, reads the JSON file that command writes under
    ``results/``, and offers helpers to tabulate and plot the scores.

    Attributes:
        api_key: The API key passed to the constructor.
        openai_models: Mapping of display name -> model id for the models
            that ``evaluate_all_models`` evaluates.
        open_source_models: Mapping of display name -> model id; not
            evaluated by any method in this class (placeholder).
        benchmark_tasks: Task names evaluated by ``evaluate_all_models``.
        custom_tasks: Names of planned custom tasks; not used by any method.
        results: ``{model_name: {task: metrics_dict}}`` filled in by
            ``evaluate_all_models``.
    """

    # Column layout of the tidy frame produced by ``analyze_results``. Fixing
    # it up front keeps the frame well-formed even when there are no rows.
    _RESULT_COLUMNS = ['model', 'task', 'accuracy', 'acc_norm']

    def __init__(self, api_key):
        """Store the API key and export it for child processes.

        Args:
            api_key: API key string; written to the
                ``OPENAI_API_SECRET_KEY`` environment variable, which the
                subprocesses started by ``run_evaluation`` inherit.
        """
        self.api_key = api_key
        os.environ['OPENAI_API_SECRET_KEY'] = api_key

        # Models to evaluate
        self.openai_models = {
            'gpt-4o': 'gpt-4o',
            'gpt-4o-mini': 'gpt-4o-mini',
            'o1-preview': 'o1-preview',
            'o1-mini': 'o1-mini',
        }

        # Open source models for comparison
        self.open_source_models = {
            'llama-2-70b': 'meta-llama/Llama-2-70b-chat-hf',
            'llama-3-70b': 'meta-llama/Meta-Llama-3-70B-Instruct',
        }

        # Logical fallacy tasks
        self.benchmark_tasks = [
            'bigbench_formal_fallacies_syllogisms_negation',
            'bigbench_logical_deduction_three_objects',
            'bigbench_logical_deduction_five_objects',
            'bigbench_logical_deduction_seven_objects',
        ]

        # Custom fallacy tasks we'll create.
        # NOTE(review): create_custom_fallacy_tasks() currently returns only
        # 'ad_hominem' and 'straw_man', keyed without the 'custom_' prefix.
        self.custom_tasks = [
            'custom_ad_hominem',
            'custom_straw_man',
            'custom_false_dichotomy',
            'custom_circular_reasoning',
        ]

        self.results = {}

    @staticmethod
    def _extract_metric(task_metrics, name):
        """Return metric ``name`` from one task's metrics dict, or 0 if absent.

        Both the bare key (``'acc'``) and the filter-suffixed spelling
        (``'acc,none'``) are accepted. NOTE(review): the suffixed form is what
        recent lm-evaluation-harness releases appear to write -- confirm
        against the installed version.
        """
        for key in (name, f'{name},none'):
            if key in task_metrics:
                return task_metrics[key]
        return 0

    @classmethod
    def _summarize_task_results(cls, task_results, task):
        """Reduce a parsed results JSON document to the metrics kept per task."""
        task_metrics = task_results.get('results', {}).get(task, {})
        return {
            'accuracy': cls._extract_metric(task_metrics, 'acc'),
            'acc_norm': cls._extract_metric(task_metrics, 'acc_norm'),
            'samples': task_results.get('samples', []),
        }

    def run_evaluation(self, model_name, model_id, tasks):
        """Evaluate one model on each task via an external command.

        Args:
            model_name: Display name; used in log messages and output file
                names.
            model_id: Identifier passed to the command as ``model=<id>``.
            tasks: Iterable of task names, evaluated one subprocess each.

        Returns:
            Dict mapping each task to ``{'accuracy', 'acc_norm', 'samples'}``
            on success, or ``{'accuracy': 0, 'acc_norm': 0, 'error': <str>}``
            when the command exits non-zero or any exception (including the
            one-hour timeout) occurs. Failures never propagate to the caller.
        """
        results = {}

        # The output files live here; create it so this method also works when
        # called directly rather than through evaluate_all_models().
        os.makedirs('results', exist_ok=True)

        for task in tasks:
            print(f"Evaluating {model_name} on {task}...")
            output_path = f'results/{model_name}_{task}.json'

            try:
                # Construct the command.
                # NOTE(review): this assumes a local main.py that accepts an
                # 'lm_eval' argument -- confirm against the harness in use.
                cmd = [
                    'python', 'main.py',
                    'lm_eval',
                    '--model', 'openai-completions',
                    '--model_args', f'model={model_id}',
                    '--tasks', task,
                    '--output_path', output_path,
                    '--log_samples'
                ]

                # Run the evaluation (argument list, no shell involved).
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)

                if result.returncode == 0:
                    # Parse results
                    with open(output_path, 'r') as f:
                        task_results = json.load(f)

                    results[task] = self._summarize_task_results(task_results, task)
                else:
                    print(f"Error evaluating {model_name} on {task}: {result.stderr}")
                    results[task] = {'accuracy': 0, 'acc_norm': 0, 'error': result.stderr}

            except Exception as e:
                # Deliberately best-effort: one failing task must not abort
                # the remaining evaluations; the error is recorded instead.
                print(f"Exception evaluating {model_name} on {task}: {e}")
                results[task] = {'accuracy': 0, 'acc_norm': 0, 'error': str(e)}

        return results

    def create_custom_fallacy_tasks(self):
        """Build in-memory example sets for hand-written fallacy tasks.

        Returns:
            Dict mapping a fallacy name (``'ad_hominem'``, ``'straw_man'``) to
            a list of example dicts with ``input``, ``target`` and
            ``explanation`` keys. Nothing is written to disk.
        """

        # Ad Hominem fallacy examples
        ad_hominem_examples = [
            {
                "input": "John argues that we should increase funding for education. But John dropped out of high school, so his argument is invalid.",
                "target": "ad_hominem",
                "explanation": "Attacks the person making the argument rather than the argument itself"
            },
            {
                "input": "Sarah says we need stricter environmental regulations. But she drives an SUV, so she's clearly wrong.",
                "target": "ad_hominem",
                "explanation": "Dismisses the argument based on perceived hypocrisy rather than merit"
            }
        ]

        # Straw Man fallacy examples
        straw_man_examples = [
            {
                "input": "Person A: 'We should have some gun control measures.' Person B: 'So you want to take away all guns and leave us defenseless!'",
                "target": "straw_man",
                "explanation": "Misrepresents the original argument to make it easier to attack"
            }
        ]

        # Create task files (simplified - in practice you'd create proper JSONL files)
        custom_tasks = {
            'ad_hominem': ad_hominem_examples,
            'straw_man': straw_man_examples,
        }

        return custom_tasks

    def evaluate_all_models(self):
        """Evaluate every model in ``openai_models`` on ``benchmark_tasks``.

        Returns:
            ``self.results``, updated with one entry per evaluated model.
        """

        # Create results directory
        os.makedirs('results', exist_ok=True)

        # Evaluate OpenAI models
        for model_name, model_id in self.openai_models.items():
            print(f"\n=== Evaluating {model_name} ===")
            self.results[model_name] = self.run_evaluation(
                model_name, model_id, self.benchmark_tasks
            )

        # Note: For open source models, you'd need different evaluation setup
        # This is a placeholder for the structure

        return self.results

    def analyze_results(self):
        """Tabulate ``self.results`` and print per-model / per-task summaries.

        Entries carrying an ``'error'`` key are excluded from the analysis.

        Returns:
            Tuple ``(df, model_avg, task_performance)``:
            ``df`` has one row per successful (model, task) pair with columns
            ``model``, ``task``, ``accuracy``, ``acc_norm``; ``model_avg`` is
            the mean of both metrics per model; ``task_performance`` is a
            task x model table of accuracy. The two summaries are rounded to
            three decimals. When there are no successful entries all three
            frames are empty.
        """

        # Convert results to DataFrame for easier analysis
        data = [
            {
                'model': model,
                'task': task,
                'accuracy': metrics['accuracy'],
                'acc_norm': metrics['acc_norm'],
            }
            for model, tasks in self.results.items()
            for task, metrics in tasks.items()
            if 'error' not in metrics
        ]

        # Explicit columns: without them an empty ``data`` list yields a frame
        # with no columns and the groupby below raises KeyError('model').
        df = pd.DataFrame(data, columns=self._RESULT_COLUMNS)

        if df.empty:
            print("\nNo successful evaluation results to analyze.")
            model_avg = pd.DataFrame(
                columns=['accuracy', 'acc_norm'],
                index=pd.Index([], name='model'),
            )
            task_performance = pd.DataFrame()
            return df, model_avg, task_performance

        # Calculate average performance per model
        model_avg = df.groupby('model').agg({
            'accuracy': 'mean',
            'acc_norm': 'mean'
        }).round(3)

        print("\n=== Model Performance Summary ===")
        print(model_avg)

        # Calculate performance per task
        task_performance = df.pivot_table(
            index='task',
            columns='model',
            values='accuracy'
        ).round(3)

        print("\n=== Task Performance Breakdown ===")
        print(task_performance)

        return df, model_avg, task_performance

    def create_visualizations(self, df):
        """Draw a 2x2 summary figure, save it as a PNG, and show it.

        Panels: mean accuracy per model, model x task accuracy heatmap,
        accuracy distribution per model, and mean accuracy per task.
        The figure is written to ``logical_fallacies_comparison.png``.

        Args:
            df: Frame with ``model``, ``task`` and ``accuracy`` columns, as
                returned by ``analyze_results``. An empty frame is reported
                and skipped, since there is nothing to plot.
        """
        if df.empty:
            print("No successful evaluation results to plot; skipping visualizations.")
            return

        # Set up the plotting style. Only apply the named style when this
        # matplotlib build ships it, otherwise keep the current style.
        if 'seaborn-v0_8' in plt.style.available:
            plt.style.use('seaborn-v0_8')
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # 1. Overall model comparison
        model_avg = df.groupby('model')['accuracy'].mean().sort_values(ascending=True)
        axes[0, 0].barh(model_avg.index, model_avg.values)
        axes[0, 0].set_title('Average Accuracy by Model')
        axes[0, 0].set_xlabel('Accuracy')

        # 2. Performance by task
        task_pivot = df.pivot_table(index='model', columns='task', values='accuracy')
        sns.heatmap(task_pivot, annot=True, cmap='YlOrRd', ax=axes[0, 1])
        axes[0, 1].set_title('Accuracy Heatmap by Model and Task')

        # 3. Model comparison boxplot
        sns.boxplot(data=df, x='model', y='accuracy', ax=axes[1, 0])
        axes[1, 0].set_title('Accuracy Distribution by Model')
        axes[1, 0].tick_params(axis='x', rotation=45)

        # 4. Task difficulty ranking
        task_avg = df.groupby('task')['accuracy'].mean().sort_values(ascending=True)
        axes[1, 1].barh(task_avg.index, task_avg.values)
        axes[1, 1].set_title('Average Task Difficulty (Lower = Harder)')
        axes[1, 1].set_xlabel('Average Accuracy Across Models')

        plt.tight_layout()
        plt.savefig('logical_fallacies_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()

In [None]:
def main():
    """Run the full pipeline: evaluate models, tabulate scores, plot them.

    Reads the API key from the ``OPENAI_API_SECRET_KEY`` environment
    variable and returns early (after printing a hint) when it is missing.
    Also returns early when no evaluation succeeded, because there is then
    nothing to plot.
    """

    # Initialize evaluator (you'll need to provide your OpenAI API key)
    api_key = os.getenv('OPENAI_API_SECRET_KEY')
    if not api_key:
        print("Please set your OPENAI_API_SECRET_KEY environment variable")
        return

    evaluator = LogicalFallaciesEvaluator(api_key)

    # Run evaluations; the scores are kept on evaluator.results.
    print("Starting model evaluations...")
    evaluator.evaluate_all_models()

    # Analyze results
    df, _model_avg, _task_performance = evaluator.analyze_results()

    if df.empty:
        # Every evaluation failed, so there is no figure to point the user at.
        print("\nNo successful evaluations; no figure was generated.")
        return

    # Create visualizations
    evaluator.create_visualizations(df)

    # Only the PNG is produced by this pipeline; no report file is written.
    print("\nEvaluation complete! Check 'logical_fallacies_comparison.png'")


In [None]:
# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()