In [None]:
import json
import os
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from typing import List, Dict
import pandas as pd

class HumanBenchmarkAnalyzer:
    """Score human three-way profile classifications stored as JSON files.

    Every JSON file is treated as one benchmark: a collection of response
    records that hold the true label under ``'actualType'`` and the
    participant's answer under ``'userResponse'``.  Per-file results are
    collected in ``all_results``; ``aggregate_results`` condenses them into
    ``aggregated_results``, which stays ``None`` until that has succeeded.
    """

    # (label as it appears in the responses, key used in result dictionaries).
    # The hyphenated 'manual-fake' label is reported under 'manual_fake'.
    _CATEGORIES = (
        ('legitimate', 'legitimate'),
        ('manual-fake', 'manual_fake'),
        ('gpt', 'gpt'),
    )
    _METRICS = ('precision', 'recall', 'FAR', 'FRR', 'F1')
    _SUMMARY_STATS = ('mean', 'std', 'min', 'max')
    # (actual is the category, predicted is the category) -> counter to bump.
    _OUTCOMES = {
        (True, True): 'TP',
        (False, True): 'FP',
        (True, False): 'FN',
        (False, False): 'TN',
    }

    def __init__(self):
        self.all_results = []
        self.aggregated_results = None

    def load_json_files(self, directory_path: str) -> None:
        """Analyze every ``*.json`` file found directly in *directory_path*.

        Each successfully analyzed file appends one result dictionary to
        ``all_results``.  A file that cannot be read, parsed or analyzed is
        reported on stdout and skipped, so one bad file does not abort the run.

        Raises:
            OSError: if the directory itself cannot be listed.
        """
        # os.listdir returns names in arbitrary order; sorting keeps the order
        # of all_results (and of the exported rows) reproducible.
        json_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.json'))

        for file_name in json_files:
            file_path = os.path.join(directory_path, file_name)
            try:
                # Binary mode lets the json module detect the encoding
                # (UTF-8/16/32, with or without BOM) instead of relying on
                # the platform's default text encoding.
                with open(file_path, 'rb') as f:
                    responses = json.load(f)
                result = self.analyze_single_benchmark(responses, file_name)
            except Exception as e:
                # Deliberately broad: this is the best-effort per-file boundary.
                print(f"Error processing {file_name}: {str(e)}")
            else:
                self.all_results.append(result)

    def analyze_single_benchmark(self, responses: List[Dict], file_name: str) -> Dict:
        """Compute accuracy, confusion matrix and per-category metrics for one file.

        Args:
            responses: Records with ``'actualType'`` and ``'userResponse'`` keys.
            file_name: Stored in the result to identify the benchmark.

        Returns:
            A dictionary with ``'file_name'``, ``'overall_accuracy'``,
            ``'total_samples'``, ``'confusion_matrix'`` (rows = actual,
            columns = predicted, order legitimate / manual-fake / gpt) and one
            metrics dictionary per category under ``'legitimate'``,
            ``'manual_fake'`` and ``'gpt'``.

        Raises:
            ValueError: if *responses* is empty.
            KeyError: if a record lacks one of the two required keys.
        """
        # An empty benchmark has no defined accuracy; reject it here so it
        # cannot feed an undefined value into the aggregated statistics.
        if not responses:
            raise ValueError("benchmark contains no responses")

        y_true = []
        y_pred = []
        # One-vs-rest tallies: every response updates every category.
        stats = {
            label: {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}
            for label, _ in self._CATEGORIES
        }

        for response in responses:
            actual = response['actualType']
            predicted = response['userResponse']
            y_true.append(actual)
            y_pred.append(predicted)

            for label, counts in stats.items():
                counts[self._OUTCOMES[(actual == label, predicted == label)]] += 1

        labels = [label for label, _ in self._CATEGORIES]
        result = {
            'file_name': file_name,
            'overall_accuracy': accuracy_score(y_true, y_pred),
            'total_samples': len(responses),
            'confusion_matrix': confusion_matrix(y_true, y_pred, labels=labels).tolist(),
        }
        for label, key in self._CATEGORIES:
            result[key] = self._calculate_metrics(stats[label])
        return result

    @staticmethod
    def _calculate_metrics(stat: Dict) -> Dict:
        """Derive precision, recall, FAR, FRR and F1 from TP/FP/TN/FN counts.

        FAR is FP / (FP + TN) and FRR is FN / (TP + FN) (presumably false
        acceptance / false rejection rate).  A ratio whose denominator is
        zero is reported as 0.
        """
        tp, fp, tn, fn = stat['TP'], stat['FP'], stat['TN'], stat['FN']

        def ratio(numerator, denominator):
            return numerator / denominator if denominator > 0 else 0

        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)
        return {
            'precision': precision,
            'recall': recall,
            'FAR': ratio(fp, fp + tn),
            'FRR': ratio(fn, tp + fn),
            'F1': ratio(2 * (precision * recall), precision + recall),
        }

    @staticmethod
    def _summarize(values: List) -> Dict:
        """Return mean, population std (numpy default), min and max as plain floats."""
        return {
            'mean': float(np.mean(values)),
            'std': float(np.std(values)),
            'min': float(np.min(values)),
            'max': float(np.max(values)),
        }

    # The return annotation is quoted so that it needs no additional import.
    def aggregate_results(self) -> "Dict | None":
        """Summarize all per-file results and store them in ``aggregated_results``.

        Returns:
            The aggregated dictionary (benchmark count, total samples, summary
            statistics of the overall accuracy and of every category metric,
            and the element-wise mean confusion matrix), or ``None`` when no
            benchmark has been loaded.
        """
        if not self.all_results:
            # Drop any earlier aggregate so printing/exporting cannot show
            # figures that no longer correspond to all_results.
            self.aggregated_results = None
            return None

        results = self.all_results
        aggregated = {
            'number_of_benchmarks': len(results),
            'total_samples': sum(result['total_samples'] for result in results),
            'overall_accuracy': self._summarize(
                [result['overall_accuracy'] for result in results]),
        }

        for _, key in self._CATEGORIES:
            aggregated[key] = {
                metric: self._summarize([result[key][metric] for result in results])
                for metric in self._METRICS
            }

        # Element-wise mean over the per-file matrices.
        aggregated['average_confusion_matrix'] = np.mean(
            [result['confusion_matrix'] for result in results], axis=0).tolist()

        self.aggregated_results = aggregated
        return aggregated

    def print_detailed_results(self) -> None:
        """Print the aggregated statistics, or a hint if nothing was aggregated."""
        if not self.aggregated_results:
            print("No results to display. Please run analysis first.")
            return

        aggregated = self.aggregated_results

        print("\nAggregated Human Benchmarking Results")
        print("=" * 60)
        print(f"Number of benchmarks analyzed: {aggregated['number_of_benchmarks']}")
        print(f"Total samples across all benchmarks: {aggregated['total_samples']}")

        print("\nOverall Accuracy:")
        acc = aggregated['overall_accuracy']
        print(f"Mean: {acc['mean']:.2%} ± {acc['std']:.2%}")
        print(f"Range: [{acc['min']:.2%}, {acc['max']:.2%}]")

        print("\nMetrics by Category:")
        print("-" * 60)

        for _, key in self._CATEGORIES:
            print(f"\n{key.replace('_', ' ').title()} Profiles:")
            for metric in self._METRICS:
                summary = aggregated[key][metric]
                print(f"{metric.upper()}:")
                print(f"  Mean: {summary['mean']:.2%} ± {summary['std']:.2%}")
                print(f"  Range: [{summary['min']:.2%}, {summary['max']:.2%}]")

        print("\nAverage Confusion Matrix:")
        print("-" * 60)
        print("Actual vs Predicted (legitimate, manual-fake, gpt)")
        for row in aggregated['average_confusion_matrix']:
            print([f"{x:.2f}" for x in row])

    def export_results_to_csv(self, output_path: str) -> None:
        """Write ``<output_path>_detailed.csv`` and ``<output_path>_summary.csv``.

        The detailed file has one row per benchmark file; the summary file has
        one row per statistic (mean, std, min, max).  Nothing is written when
        no aggregated results are available.
        """
        if not self.aggregated_results:
            print("No results to export. Please run analysis first.")
            return

        # One row per analyzed file.
        detailed_results = []
        for result in self.all_results:
            row = {
                'file_name': result['file_name'],
                'accuracy': result['overall_accuracy'],
                'total_samples': result['total_samples'],
            }
            for _, key in self._CATEGORIES:
                for metric in self._METRICS:
                    row[f"{key}_{metric}"] = result[key][metric]
            detailed_results.append(row)

        # One row per summary statistic.
        summary_data = []
        for stat_name in self._SUMMARY_STATS:
            row = {
                'metric': stat_name,
                'overall_accuracy': self.aggregated_results['overall_accuracy'][stat_name],
            }
            for _, key in self._CATEGORIES:
                for metric in self._METRICS:
                    row[f"{key}_{metric}"] = self.aggregated_results[key][metric][stat_name]
            summary_data.append(row)

        pd.DataFrame(detailed_results).to_csv(f"{output_path}_detailed.csv", index=False)
        pd.DataFrame(summary_data).to_csv(f"{output_path}_summary.csv", index=False)

# Example usage: three-way (legitimate / manual-fake / gpt) analysis.
analyzer = HumanBenchmarkAnalyzer()
# Analyzes every *.json file in the directory; a file that fails is reported
# on stdout and skipped.
analyzer.load_json_files('survey_responses')  # Directory containing JSON files
# Must run before printing or exporting: both return early while
# aggregated_results is still unset.
analyzer.aggregate_results()
analyzer.print_detailed_results()
# Writes human_benchmark_results_detailed.csv and
# human_benchmark_results_summary.csv.
analyzer.export_results_to_csv('human_benchmark_results')


Aggregated Human Benchmarking Results
Number of benchmarks analyzed: 30
Total samples across all benchmarks: 450

Overall Accuracy:
Mean: 41.56% ± 16.03%
Range: [13.33%, 80.00%]

Metrics by Category:
------------------------------------------------------------

Legitimate Profiles:
PRECISION:
  Mean: 39.74% ± 21.88%
  Range: [0.00%, 80.00%]
RECALL:
  Mean: 53.10% ± 32.30%
  Range: [0.00%, 100.00%]
FAR:
  Mean: 38.63% ± 18.80%
  Range: [9.09%, 77.78%]
FRR:
  Mean: 46.90% ± 32.30%
  Range: [0.00%, 100.00%]
F1:
  Mean: 43.79% ± 24.70%
  Range: [0.00%, 83.33%]

Manual Fake Profiles:
PRECISION:
  Mean: 44.86% ± 21.00%
  Range: [0.00%, 100.00%]
RECALL:
  Mean: 40.10% ± 21.56%
  Range: [0.00%, 100.00%]
FAR:
  Mean: 24.14% ± 12.86%
  Range: [0.00%, 50.00%]
FRR:
  Mean: 59.90% ± 21.56%
  Range: [0.00%, 100.00%]
F1:
  Mean: 41.17% ± 19.66%
  Range: [0.00%, 80.00%]

Gpt Profiles:
PRECISION:
  Mean: 36.69% ± 29.47%
  Range: [0.00%, 100.00%]
RECALL:
  Mean: 31.22% ± 26.73%
  Range: [0.00%, 100.00%]

In [None]:
import json
import os
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from typing import List, Dict
import pandas as pd

class BinaryBenchmarkAnalyzer:
    """Score human profile classifications as a binary legitimate-vs-fake task.

    Every JSON file is treated as one benchmark: a collection of response
    records that hold the true label under ``'actualType'`` and the
    participant's answer under ``'userResponse'``.  Any value other than
    ``'legitimate'`` is collapsed to ``'fake'``; ``'legitimate'`` is the
    positive class for all metrics.  Per-file results are collected in
    ``all_results``; ``aggregate_results`` condenses them into
    ``aggregated_results``, which stays ``None`` until that has succeeded.
    """

    _LABELS = ('legitimate', 'fake')
    _METRICS = ('precision', 'recall', 'FAR', 'FRR', 'F1')
    _SUMMARY_STATS = ('mean', 'std', 'min', 'max')
    # (actually legitimate, predicted legitimate) -> counter to bump.
    _OUTCOMES = {
        (True, True): 'TP',
        (False, True): 'FP',
        (True, False): 'FN',
        (False, False): 'TN',
    }

    def __init__(self):
        self.all_results = []
        self.aggregated_results = None

    def load_json_files(self, directory_path: str) -> None:
        """Analyze every ``*.json`` file found directly in *directory_path*.

        Each successfully analyzed file appends one result dictionary to
        ``all_results``.  A file that cannot be read, parsed or analyzed is
        reported on stdout and skipped, so one bad file does not abort the run.

        Raises:
            OSError: if the directory itself cannot be listed.
        """
        # os.listdir returns names in arbitrary order; sorting keeps the order
        # of all_results (and of the exported rows) reproducible.
        json_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.json'))

        for file_name in json_files:
            file_path = os.path.join(directory_path, file_name)
            try:
                # Binary mode lets the json module detect the encoding
                # (UTF-8/16/32, with or without BOM) instead of relying on
                # the platform's default text encoding.
                with open(file_path, 'rb') as f:
                    responses = json.load(f)
                result = self.analyze_single_benchmark(responses, file_name)
            except Exception as e:
                # Deliberately broad: this is the best-effort per-file boundary.
                print(f"Error processing {file_name}: {str(e)}")
            else:
                self.all_results.append(result)

    def analyze_single_benchmark(self, responses: List[Dict], file_name: str) -> Dict:
        """Compute binary accuracy, confusion matrix and metrics for one file.

        Args:
            responses: Records with ``'actualType'`` and ``'userResponse'`` keys.
            file_name: Stored in the result to identify the benchmark.

        Returns:
            A dictionary with ``'file_name'``, ``'overall_accuracy'``,
            ``'total_samples'``, ``'confusion_matrix'`` (rows = actual,
            columns = predicted, order legitimate / fake) and ``'metrics'``
            (precision, recall, FAR, FRR and F1 for the legitimate class).

        Raises:
            ValueError: if *responses* is empty.
            KeyError: if a record lacks one of the two required keys.
        """
        # An empty benchmark has no defined accuracy; reject it here so it
        # cannot feed an undefined value into the aggregated statistics.
        if not responses:
            raise ValueError("benchmark contains no responses")

        y_true = []
        y_pred = []
        counts = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}

        for response in responses:
            # Everything that is not exactly 'legitimate' counts as fake.
            actual_is_legit = response['actualType'] == 'legitimate'
            predicted_is_legit = response['userResponse'] == 'legitimate'

            y_true.append('legitimate' if actual_is_legit else 'fake')
            y_pred.append('legitimate' if predicted_is_legit else 'fake')
            counts[self._OUTCOMES[(actual_is_legit, predicted_is_legit)]] += 1

        conf_matrix = confusion_matrix(y_true, y_pred, labels=list(self._LABELS)).tolist()

        return {
            'file_name': file_name,
            'overall_accuracy': accuracy_score(y_true, y_pred),
            'total_samples': len(responses),
            'confusion_matrix': conf_matrix,
            'metrics': self._calculate_metrics(counts),
        }

    @staticmethod
    def _calculate_metrics(stat: Dict) -> Dict:
        """Derive precision, recall, FAR, FRR and F1 from TP/FP/TN/FN counts.

        FAR is FP / (FP + TN) and FRR is FN / (TP + FN) (presumably false
        acceptance / false rejection rate).  A ratio whose denominator is
        zero is reported as 0.
        """
        tp, fp, tn, fn = stat['TP'], stat['FP'], stat['TN'], stat['FN']

        def ratio(numerator, denominator):
            return numerator / denominator if denominator > 0 else 0

        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)
        return {
            'precision': precision,
            'recall': recall,
            'FAR': ratio(fp, fp + tn),
            'FRR': ratio(fn, tp + fn),
            'F1': ratio(2 * (precision * recall), precision + recall),
        }

    @staticmethod
    def _summarize(values: List) -> Dict:
        """Return mean, population std (numpy default), min and max as plain floats."""
        return {
            'mean': float(np.mean(values)),
            'std': float(np.std(values)),
            'min': float(np.min(values)),
            'max': float(np.max(values)),
        }

    # The return annotation is quoted so that it needs no additional import.
    def aggregate_results(self) -> "Dict | None":
        """Summarize all per-file results and store them in ``aggregated_results``.

        Returns:
            The aggregated dictionary (benchmark count, total samples, summary
            statistics of the overall accuracy and of every metric, and the
            element-wise mean confusion matrix), or ``None`` when no benchmark
            has been loaded.
        """
        if not self.all_results:
            # Drop any earlier aggregate so printing/exporting cannot show
            # figures that no longer correspond to all_results.
            self.aggregated_results = None
            return None

        results = self.all_results
        aggregated = {
            'number_of_benchmarks': len(results),
            'total_samples': sum(result['total_samples'] for result in results),
            'overall_accuracy': self._summarize(
                [result['overall_accuracy'] for result in results]),
            'metrics': {
                metric: self._summarize([result['metrics'][metric] for result in results])
                for metric in self._METRICS
            },
        }

        # Element-wise mean over the per-file matrices.
        aggregated['average_confusion_matrix'] = np.mean(
            [result['confusion_matrix'] for result in results], axis=0).tolist()

        self.aggregated_results = aggregated
        return aggregated

    def print_detailed_results(self) -> None:
        """Print the aggregated statistics, or a hint if nothing was aggregated."""
        if not self.aggregated_results:
            print("No results to display. Please run analysis first.")
            return

        aggregated = self.aggregated_results

        print("\nAggregated Binary Classification Results")
        print("=" * 60)
        print(f"Number of benchmarks analyzed: {aggregated['number_of_benchmarks']}")
        print(f"Total samples across all benchmarks: {aggregated['total_samples']}")

        print("\nOverall Accuracy:")
        acc = aggregated['overall_accuracy']
        print(f"Mean: {acc['mean']:.2%} ± {acc['std']:.2%}")
        print(f"Range: [{acc['min']:.2%}, {acc['max']:.2%}]")

        print("\nClassification Metrics:")
        print("-" * 60)

        for metric in self._METRICS:
            summary = aggregated['metrics'][metric]
            print(f"{metric.upper()}:")
            print(f"  Mean: {summary['mean']:.2%} ± {summary['std']:.2%}")
            print(f"  Range: [{summary['min']:.2%}, {summary['max']:.2%}]")

        print("\nAverage Confusion Matrix:")
        print("-" * 60)
        print("Actual vs Predicted (legitimate, fake)")
        for row in aggregated['average_confusion_matrix']:
            print([f"{x:.2f}" for x in row])

    def export_results_to_csv(self, output_path: str) -> None:
        """Write ``<output_path>_detailed.csv`` and ``<output_path>_summary.csv``.

        The detailed file has one row per benchmark file; the summary file has
        one row per statistic (mean, std, min, max).  Nothing is written when
        no aggregated results are available.
        """
        if not self.aggregated_results:
            print("No results to export. Please run analysis first.")
            return

        # One row per analyzed file.
        detailed_results = []
        for result in self.all_results:
            row = {
                'file_name': result['file_name'],
                'accuracy': result['overall_accuracy'],
                'total_samples': result['total_samples'],
            }
            for metric in self._METRICS:
                row[metric] = result['metrics'][metric]
            detailed_results.append(row)

        # One row per summary statistic.
        summary_data = []
        for stat_name in self._SUMMARY_STATS:
            row = {
                'metric': stat_name,
                'overall_accuracy': self.aggregated_results['overall_accuracy'][stat_name],
            }
            for metric in self._METRICS:
                row[metric] = self.aggregated_results['metrics'][metric][stat_name]
            summary_data.append(row)

        pd.DataFrame(detailed_results).to_csv(f"{output_path}_detailed.csv", index=False)
        pd.DataFrame(summary_data).to_csv(f"{output_path}_summary.csv", index=False)

# Example usage: binary (legitimate vs. fake) view of the survey responses.
analyzer = BinaryBenchmarkAnalyzer()
analyzer.load_json_files('survey_responses')  # Directory containing JSON files
# Must run before printing or exporting: both return early while
# aggregated_results is still unset.
analyzer.aggregate_results()
analyzer.print_detailed_results()
# Export under a prefix of its own: the three-way analysis earlier in this file
# already writes to 'human_benchmark_results', and reusing that prefix would
# overwrite its *_detailed.csv and *_summary.csv files with the binary figures.
analyzer.export_results_to_csv('binary_benchmark_results')


Aggregated Binary Classification Results
Number of benchmarks analyzed: 30
Total samples across all benchmarks: 450

Overall Accuracy:
Mean: 58.67% ± 16.55%
Range: [33.33%, 86.67%]

Classification Metrics:
------------------------------------------------------------
PRECISION:
  Mean: 39.74% ± 21.88%
  Range: [0.00%, 80.00%]
RECALL:
  Mean: 53.10% ± 32.30%
  Range: [0.00%, 100.00%]
FAR:
  Mean: 38.63% ± 18.80%
  Range: [9.09%, 77.78%]
FRR:
  Mean: 46.90% ± 32.30%
  Range: [0.00%, 100.00%]
F1:
  Mean: 43.79% ± 24.70%
  Range: [0.00%, 83.33%]

Average Confusion Matrix:
------------------------------------------------------------
Actual vs Predicted (legitimate, fake)
['2.67', '2.33']
['3.87', '6.13']
