In [8]:
import random
import json
from collections import Counter, defaultdict
from typing import List, Dict, Any, Tuple
import re
from dataclasses import dataclass
import statistics

In [9]:
@dataclass
class GenerationResult:
    """One simulated model response to the tour-routing prompt."""

    # Ordered city names making up the proposed tour.
    route: List[str]
    # Free-text justification that accompanies the route.
    reasoning: str
    # Confidence attached to the route; the analyzer clamps it to [0.1, 1.0].
    confidence_score: float
    # Simulated response latency; the analyzer draws it uniformly from 0.5-3.0.
    response_time: float

class MultiGenerationAnalyzer:
    def __init__(self, num_generations: int = 10):

        self.cities = ["Amsterdam", "Warsaw", "Hamburg", "Barcelona", "Delhi", "Shanghai", "Toronto"]
        self.num_generations = num_generations
        
        # Define the "ground truth" optimal routes for comparison
        self.optimal_routes = {
            "geographical": ["Toronto", "Amsterdam", "Hamburg", "Warsaw", "Barcelona", "Delhi", "Shanghai"],
            "hub_based": ["Toronto", "Amsterdam", "Warsaw", "Hamburg", "Barcelona", "Delhi", "Shanghai"],
            "time_zone": ["Toronto", "Amsterdam", "Hamburg", "Warsaw", "Barcelona", "Delhi", "Shanghai"]
        }
        
        # Temperature settings for different generation attempts
        self.temperature_settings = [0.3, 0.5, 0.7, 0.9, 1.0]

    def simulate_model_generations(self, strategy: str, model_size: str) -> List[GenerationResult]:

        generations = []
        
        # Define possible route variations based on model size and strategy
        route_variations = self._get_route_variations(model_size, strategy)
        
        for i in range(self.num_generations):
            # Simulate temperature variation
            temp = self.temperature_settings[i % len(self.temperature_settings)]
            
            # Select route with probability based on model capability
            route = self._select_route_by_probability(route_variations, model_size, temp)
            
            # Generate reasoning quality based on model size
            reasoning = self._generate_reasoning(route, strategy, model_size)
            
            # Assign confidence score
            confidence = self._calculate_confidence(route, strategy, model_size)
            
            # Simulate response time
            response_time = random.uniform(0.5, 3.0)
            
            generations.append(GenerationResult(
                route=route,
                reasoning=reasoning,
                confidence_score=confidence,
                response_time=response_time
            ))
        
        return generations

    def _get_route_variations(self, model_size: str, strategy: str) -> Dict[str, float]:
        
        base_routes = {
            "optimal": ["Toronto", "Amsterdam", "Hamburg", "Warsaw", "Barcelona", "Delhi", "Shanghai"],
            "suboptimal_1": ["Amsterdam", "Toronto", "Warsaw", "Hamburg", "Barcelona", "Delhi", "Shanghai"],
            "suboptimal_2": ["Toronto", "Barcelona", "Amsterdam", "Hamburg", "Warsaw", "Delhi", "Shanghai"],
            "random_1": ["Shanghai", "Toronto", "Delhi", "Barcelona", "Amsterdam", "Warsaw", "Hamburg"],
            "random_2": ["Barcelona", "Warsaw", "Shanghai", "Toronto", "Delhi", "Amsterdam", "Hamburg"],
            "partial_correct": ["Toronto", "Amsterdam", "Warsaw", "Hamburg", "Barcelona", "Shanghai", "Delhi"],
        }
        
        # Probability distributions based on model size
        if model_size == "3B":
            return {
                "optimal": 0.2, "suboptimal_1": 0.3, "suboptimal_2": 0.2,
                "random_1": 0.15, "random_2": 0.1, "partial_correct": 0.05
            }
        elif model_size == "7B":
            return {
                "optimal": 0.4, "suboptimal_1": 0.25, "suboptimal_2": 0.15,
                "random_1": 0.1, "random_2": 0.05, "partial_correct": 0.05
            }
        elif model_size == "30B":
            return {
                "optimal": 0.6, "suboptimal_1": 0.2, "suboptimal_2": 0.1,
                "random_1": 0.05, "random_2": 0.03, "partial_correct": 0.02
            }
        else:  # Proprietary
            return {
                "optimal": 0.8, "suboptimal_1": 0.12, "suboptimal_2": 0.05,
                "random_1": 0.02, "random_2": 0.01, "partial_correct": 0.0
            }

    def _select_route_by_probability(self, route_variations: Dict[str, float], 
                                   model_size: str, temperature: float) -> List[str]:
        
        base_routes = {
            "optimal": ["Toronto", "Amsterdam", "Hamburg", "Warsaw", "Barcelona", "Delhi", "Shanghai"],
            "suboptimal_1": ["Amsterdam", "Toronto", "Warsaw", "Hamburg", "Barcelona", "Delhi", "Shanghai"],
            "suboptimal_2": ["Toronto", "Barcelona", "Amsterdam", "Hamburg", "Warsaw", "Delhi", "Shanghai"],
            "random_1": ["Shanghai", "Toronto", "Delhi", "Barcelona", "Amsterdam", "Warsaw", "Hamburg"],
            "random_2": ["Barcelona", "Warsaw", "Shanghai", "Toronto", "Delhi", "Amsterdam", "Hamburg"],
            "partial_correct": ["Toronto", "Amsterdam", "Warsaw", "Hamburg", "Barcelona", "Shanghai", "Delhi"],
        }
        
        # Adjust probabilities based on temperature
        adjusted_probs = {}
        for route_type, prob in route_variations.items():
            if temperature > 0.7:  # High temperature - more randomness
                adjusted_probs[route_type] = prob * (0.5 + temperature * 0.5)
            else:  # Low temperature - more deterministic
                adjusted_probs[route_type] = prob * (1.5 - temperature)
        
        # Normalize probabilities
        total_prob = sum(adjusted_probs.values())
        normalized_probs = {k: v/total_prob for k, v in adjusted_probs.items()}
        
        # Select route
        rand_val = random.random()
        cumulative_prob = 0
        for route_type, prob in normalized_probs.items():
            cumulative_prob += prob
            if rand_val <= cumulative_prob:
                return base_routes[route_type].copy()
        
        return base_routes["optimal"].copy()

    def _generate_reasoning(self, route: List[str], strategy: str, model_size: str) -> str:
        
        reasoning_templates = {
            "3B": [
                f"Route goes through {len(route)} cities.",
                f"Starting from {route[0]} and ending at {route[-1]}.",
                "This seems like a good order.",
                f"Europe cities: {[c for c in route if c in ['Amsterdam', 'Warsaw', 'Hamburg', 'Barcelona']]}."
            ],
            "7B": [
                f"Optimized route considers geographic proximity: {' → '.join(route)}.",
                f"Continental grouping: Europe first, then Asia, starting from {route[0]}.",
                "This minimizes backtracking between continents.",
                f"Total estimated segments: {len(route)-1} flights."
            ],
            "30B": [
                f"Analyzed route: {' → '.join(route)} optimizes for flight connections and time zones.",
                f"Geographic efficiency: Starting {route[0]} allows westward progression.",
                "Hub airport strategy utilized for European cities.",
                f"Estimated total travel time reduction: 15-20% vs random ordering."
            ],
            "proprietary": [
                f"Optimal touring sequence: {' → '.join(route)} based on comprehensive analysis.",
                f"Multi-factor optimization: flight times, hub connections, jet lag minimization.",
                f"Continental clustering strategy reduces intercontinental flights.",
                f"Quantitative analysis shows 18% improvement over alternative routes."
            ]
        }
        
        templates = reasoning_templates.get(model_size, reasoning_templates["7B"])
        return random.choice(templates)

    def _calculate_confidence(self, route: List[str], strategy: str, model_size: str) -> float:
        
        # Base confidence by model size
        base_confidence = {
            "3B": 0.4, "7B": 0.6, "30B": 0.8, "proprietary": 0.9
        }
        
        # Route quality assessment
        optimal_route = self.optimal_routes["geographical"]
        
        # Calculate similarity to optimal route
        similarity = self._calculate_route_similarity(route, optimal_route)
        
        # Adjust confidence based on similarity
        confidence = base_confidence[model_size] * (0.5 + 0.5 * similarity)
        
        # Add some randomness
        confidence += random.uniform(-0.1, 0.1)
        
        return max(0.1, min(1.0, confidence))

    def _calculate_route_similarity(self, route1: List[str], route2: List[str]) -> float:
        if len(route1) != len(route2):
            return 0.0
        
        # Position-based similarity
        position_matches = sum(1 for i, city in enumerate(route1) if i < len(route2) and city == route2[i])
        position_similarity = position_matches / len(route1)
        
        # Sequence similarity (adjacent pairs)
        pairs1 = [(route1[i], route1[i+1]) for i in range(len(route1)-1)]
        pairs2 = [(route2[i], route2[i+1]) for i in range(len(route2)-1)]
        common_pairs = len(set(pairs1) & set(pairs2))
        sequence_similarity = common_pairs / len(pairs1) if pairs1 else 0
        
        return (position_similarity + sequence_similarity) / 2

    def analyze_generations(self, generations: List[GenerationResult]) -> Dict[str, Any]:
        
        # Extract routes for analysis
        routes = [tuple(gen.route) for gen in generations]
        route_counter = Counter(routes)
        
        # Find most common route (majority vote)
        majority_route = route_counter.most_common(1)[0][0]
        majority_count = route_counter.most_common(1)[0][1]
        
        # Calculate diversity metrics
        unique_routes = len(route_counter)
        diversity_score = unique_routes / len(generations)
        
        # Confidence statistics
        confidences = [gen.confidence_score for gen in generations]
        avg_confidence = statistics.mean(confidences)
        confidence_std = statistics.stdev(confidences) if len(confidences) > 1 else 0
        
        # Quality assessment
        optimal_route = tuple(self.optimal_routes["geographical"])
        majority_is_optimal = majority_route == optimal_route
        
        # Calculate route quality scores
        quality_scores = []
        for gen in generations:
            similarity = self._calculate_route_similarity(gen.route, list(optimal_route))
            quality_scores.append(similarity)
        
        avg_quality = statistics.mean(quality_scores)
        best_quality = max(quality_scores)
        
        return {
            "majority_route": list(majority_route),
            "majority_count": majority_count,
            "majority_percentage": majority_count / len(generations),
            "unique_routes": unique_routes,
            "diversity_score": diversity_score,
            "majority_is_optimal": majority_is_optimal,
            "avg_confidence": avg_confidence,
            "confidence_std": confidence_std,
            "avg_quality": avg_quality,
            "best_quality": best_quality,
            "all_routes": dict(route_counter),
            "quality_scores": quality_scores
        }

    def run_comprehensive_analysis(self) -> Dict[str, Any]:
        """Simulate and analyse every strategy/model combination.

        Prints a short progress summary for each combination as it goes.

        Returns:
            Nested dict ``results[strategy][model]`` holding the output of
            ``analyze_generations`` for that combination.
        """
        strategy_names = ("zero_shot", "few_shot", "chain_of_thought", "adversarial")
        model_names = ("3B", "7B", "30B", "proprietary")

        print("🔄 Running Multiple Generation Analysis")
        print("=" * 50)

        all_results = {}
        for strategy in strategy_names:
            per_model = all_results.setdefault(strategy, {})
            heading = strategy.replace('_', ' ').title()
            print(f"\n📝 Strategy: {heading}")

            for model in model_names:
                print(f"  🤖 Model: {model}")

                # Simulate the batch, then reduce it to summary statistics.
                batch = self.simulate_model_generations(strategy, model)
                summary = self.analyze_generations(batch)
                per_model[model] = summary

                verdict = '✅' if summary['majority_is_optimal'] else '❌'
                print(f"    • Majority vote: {summary['majority_percentage']:.1%}")
                print(f"    • Route diversity: {summary['unique_routes']} unique routes")
                print(f"    • Optimal route selected: {verdict}")

        return all_results

    def evaluate_majority_voting_effectiveness(self, results: Dict[str, Any]) -> Dict[str, Any]:
        
        evaluation = {
            "overall_accuracy": 0,
            "by_strategy": {},
            "by_model": {},
            "consistency_analysis": {},
            "recommendations": []
        }
        
        total_tests = 0
        correct_majority_votes = 0
        
        # Analyze by strategy
        for strategy, strategy_results in results.items():
            strategy_correct = 0
            strategy_total = 0
            
            evaluation["by_strategy"][strategy] = {
                "accuracy": 0,
                "avg_diversity": 0,
                "avg_confidence": 0,
                "details": {}
            }
            
            for model, model_results in strategy_results.items():
                total_tests += 1
                strategy_total += 1
                
                if model_results["majority_is_optimal"]:
                    correct_majority_votes += 1
                    strategy_correct += 1
                
                # Store detailed results
                evaluation["by_strategy"][strategy]["details"][model] = {
                    "correct": model_results["majority_is_optimal"],
                    "majority_strength": model_results["majority_percentage"],
                    "diversity": model_results["diversity_score"],
                    "confidence": model_results["avg_confidence"]
                }
            
            evaluation["by_strategy"][strategy]["accuracy"] = strategy_correct / strategy_total
            
            # Calculate averages
            details = evaluation["by_strategy"][strategy]["details"]
            evaluation["by_strategy"][strategy]["avg_diversity"] = statistics.mean([d["diversity"] for d in details.values()])
            evaluation["by_strategy"][strategy]["avg_confidence"] = statistics.mean([d["confidence"] for d in details.values()])
        
        # Analyze by model
        model_stats = defaultdict(lambda: {"correct": 0, "total": 0, "diversities": [], "confidences": []})
        
        for strategy_results in results.values():
            for model, model_results in strategy_results.items():
                model_stats[model]["total"] += 1
                if model_results["majority_is_optimal"]:
                    model_stats[model]["correct"] += 1
                model_stats[model]["diversities"].append(model_results["diversity_score"])
                model_stats[model]["confidences"].append(model_results["avg_confidence"])
        
        for model, stats in model_stats.items():
            evaluation["by_model"][model] = {
                "accuracy": stats["correct"] / stats["total"],
                "avg_diversity": statistics.mean(stats["diversities"]),
                "avg_confidence": statistics.mean(stats["confidences"])
            }
        
        # Overall accuracy
        evaluation["overall_accuracy"] = correct_majority_votes / total_tests
        
        # Consistency analysis
        self._analyze_consistency(results, evaluation)
        
        # Generate recommendations
        self._generate_recommendations(evaluation)
        
        return evaluation

    def _analyze_consistency(self, results: Dict[str, Any], evaluation: Dict[str, Any]):
        
        consistency_patterns = {
            "high_diversity_low_accuracy": [],
            "low_diversity_high_accuracy": [],
            "confidence_accuracy_correlation": []
        }
        
        for strategy, strategy_results in results.items():
            for model, model_results in strategy_results.items():
                diversity = model_results["diversity_score"]
                accuracy = 1 if model_results["majority_is_optimal"] else 0
                confidence = model_results["avg_confidence"]
                
                if diversity > 0.7 and accuracy == 0:
                    consistency_patterns["high_diversity_low_accuracy"].append(f"{strategy}-{model}")
                
                if diversity < 0.3 and accuracy == 1:
                    consistency_patterns["low_diversity_high_accuracy"].append(f"{strategy}-{model}")
                
                consistency_patterns["confidence_accuracy_correlation"].append((confidence, accuracy))
        
        evaluation["consistency_analysis"] = consistency_patterns

    def _generate_recommendations(self, evaluation: Dict[str, Any]):        
        recommendations = []
        
        # Overall performance
        if evaluation["overall_accuracy"] > 0.7:
            recommendations.append("✅ Majority voting is generally effective for this task")
        else:
            recommendations.append("⚠️ Majority voting shows mixed results - consider alternative approaches")
        
        # Model-specific recommendations
        best_model = max(evaluation["by_model"].items(), key=lambda x: x[1]["accuracy"])
        recommendations.append(f"🏆 Best performing model: {best_model[0]} ({best_model[1]['accuracy']:.1%} accuracy)")
        
        # Strategy-specific recommendations
        best_strategy = max(evaluation["by_strategy"].items(), key=lambda x: x[1]["accuracy"])
        recommendations.append(f"🎯 Most effective strategy: {best_strategy[0]} ({best_strategy[1]['accuracy']:.1%} accuracy)")
        
        # Diversity insights
        high_diversity_strategies = [s for s, data in evaluation["by_strategy"].items() if data["avg_diversity"] > 0.5]
        if high_diversity_strategies:
            recommendations.append(f"🔄 High diversity strategies: {', '.join(high_diversity_strategies)} - may benefit from more generations")
        
        evaluation["recommendations"] = recommendations

    def display_detailed_results(self, results: Dict[str, Any], evaluation: Dict[str, Any]):
        
        print("\n" + "="*70)
        print("📊 DETAILED MAJORITY VOTING ANALYSIS")
        print("="*70)
        
        # Overall summary
        print(f"\n🎯 Overall Majority Voting Accuracy: {evaluation['overall_accuracy']:.1%}")
        print(f"📝 Total test scenarios: {sum(len(s) for s in results.values())}")
        
        # Strategy performance
        print(f"\n📈 Performance by Strategy:")
        for strategy, data in evaluation["by_strategy"].items():
            print(f"  {strategy.replace('_', ' ').title()}: {data['accuracy']:.1%} accuracy")
            print(f"    • Average diversity: {data['avg_diversity']:.2f}")
            print(f"    • Average confidence: {data['avg_confidence']:.2f}")
        
        # Model performance
        print(f"\n🤖 Performance by Model Size:")
        for model, data in evaluation["by_model"].items():
            print(f"  {model}: {data['accuracy']:.1%} accuracy")
            print(f"    • Average diversity: {data['avg_diversity']:.2f}")
            print(f"    • Average confidence: {data['avg_confidence']:.2f}")
        
        # Detailed breakdown
        print(f"\n📋 Detailed Results:")
        for strategy, strategy_results in results.items():
            print(f"\n  {strategy.replace('_', ' ').title()}:")
            for model, model_results in strategy_results.items():
                status = "✅" if model_results["majority_is_optimal"] else "❌"
                print(f"    {model}: {status} Majority: {model_results['majority_percentage']:.1%} " + 
                      f"({model_results['majority_count']}/{self.num_generations}) " +
                      f"Unique routes: {model_results['unique_routes']}")
        
        # Recommendations
        print(f"\n💡 Key Insights:")
        for rec in evaluation["recommendations"]:
            print(f"  {rec}")
        
        # Sample route variations
        print(f"\n🗺️ Sample Route Variations (Chain-of-thought, 30B model):")
        if "chain_of_thought" in results and "30B" in results["chain_of_thought"]:
            sample_routes = results["chain_of_thought"]["30B"]["all_routes"]
            for i, (route, count) in enumerate(list(sample_routes.items())[:3]):
                print(f"  Route {i+1} ({count}x): {' → '.join(route)}")

In [10]:
def main():
    """Run the full simulated majority-voting experiment and print the report."""
    print("🎭 Multiple Generation Analysis with Majority Voting")
    print("Testing concert tour optimization across different prompting strategies")

    # Ten simulated responses per strategy/model scenario.
    analyzer = MultiGenerationAnalyzer(num_generations=10)

    # Simulate and analyse every scenario, then score the majority votes
    # and print the detailed report.
    results = analyzer.run_comprehensive_analysis()
    analyzer.display_detailed_results(
        results,
        analyzer.evaluate_majority_voting_effectiveness(results),
    )

    print(f"\n🎵 Analysis complete! Generated {analyzer.num_generations} responses per scenario.")
    print("Key finding: Majority voting effectiveness depends on model size and prompting strategy.")

In [11]:
# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

🎭 Multiple Generation Analysis with Majority Voting
Testing concert tour optimization across different prompting strategies
🔄 Running Multiple Generation Analysis

📝 Strategy: Zero Shot
  🤖 Model: 3B
    • Majority vote: 40.0%
    • Route diversity: 5 unique routes
    • Optimal route selected: ❌
  🤖 Model: 7B
    • Majority vote: 30.0%
    • Route diversity: 4 unique routes
    • Optimal route selected: ✅
  🤖 Model: 30B
    • Majority vote: 30.0%
    • Route diversity: 5 unique routes
    • Optimal route selected: ❌
  🤖 Model: proprietary
    • Majority vote: 100.0%
    • Route diversity: 1 unique routes
    • Optimal route selected: ✅

📝 Strategy: Few Shot
  🤖 Model: 3B
    • Majority vote: 40.0%
    • Route diversity: 5 unique routes
    • Optimal route selected: ❌
  🤖 Model: 7B
    • Majority vote: 50.0%
    • Route diversity: 4 unique routes
    • Optimal route selected: ✅
  🤖 Model: 30B
    • Majority vote: 50.0%
    • Route diversity: 4 unique routes
    • Optimal route selected