# Model Fusion and Comparison Experiment

## Objective

Benchmark different model fusion strategies to empirically answer:
- **"Does weight merging outperform simple ensembling on our data?"**
- **"Which fusion strategy yields the best performance for our MVP?"**

## Strategies Evaluated

1. **Direct Ensemble**: Simple output averaging
2. **Weighted Average**: Fixed-weight combination of model outputs (e.g., 0.7/0.3)
3. **Parameter Merging**: Direct model weight fusion

## Expected Outcomes

- Performance comparison across fusion strategies
- Computational efficiency analysis
- Recommendations for optimal model combination approach

In [None]:
# Import required libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time
import json
from typing import Dict, List, Tuple, Any, Callable
from dataclasses import dataclass
import sys

# Make the project importable from this notebook: when the working directory
# is named `experiments`, its parent is used as the root; otherwise the working
# directory itself is used.
_cwd = Path().resolve()
project_root = _cwd.parent if _cwd.name == 'experiments' else _cwd
sys.path.insert(0, str(project_root))

# Import Symbio AI components
from models.merger import evolutionary_merge, MergeConfig, EvolutionaryModelMerger
from evaluation.benchmarks import BenchmarkRunner, AccuracyBenchmark
from monitoring.production import MetricsCollector, ProductionLogger

# Report the environment once the imports above have succeeded.
for _status_line in (
    "✅ Dependencies loaded successfully",
    f"📁 Project root: {project_root}",
    f"🔧 PyTorch version: {torch.__version__}",
    f"🐍 Python version: {sys.version}",
):
    print(_status_line)

## Experiment Setup: Model Fusion Strategies

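We define several fusion approaches. Output-level strategies are functions applied to the list of model outputs; parameter-merging entries are placeholders (`None`) until the merged models are built later in the notebook: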

In [None]:
# Experiment: Compare different model fusion strategies
#
# Every output-level strategy is a callable that receives the list of per-model
# output tensors (raw logits) and returns one fused tensor of the same shape.
# Parameter-merging entries are placeholders (None) until merged models exist.


def _geometric_mean_fusion(outs):
    """Fuse model outputs by the geometric mean of their class probabilities.

    The inputs are raw logits, which may be negative, so taking ``log`` of
    them directly yields NaN. Each model's logits are therefore converted to
    log-probabilities first and averaged across models. The result is the log
    of the (unnormalised) geometric mean, so applying softmax to it gives the
    normalised geometric-mean distribution.

    Args:
        outs: Sequence of logit tensors with identical shapes.

    Returns:
        Tensor with the same shape as each input, usable as logits.
    """
    log_probs = torch.log_softmax(torch.stack(outs), dim=-1)
    return log_probs.mean(dim=0)


strategies = {
    "Direct Ensemble": lambda outputs: sum(outputs)/len(outputs),      # average predictions
    "Weighted Average (0.7/0.3)": lambda outs: 0.7*outs[0] + 0.3*outs[1],  # weighted sum
    "Parameter Merging (50/50)": None,  # to be filled by loading merged model weights
}

print("🔬 Model Fusion Strategies Defined:")
for name, func in strategies.items():
    if func is not None:
        print(f"  ✓ {name}: Function-based fusion")
    else:
        print(f"  ⚙️ {name}: Parameter-level fusion (to be implemented)")

# Additional advanced strategies for comprehensive comparison
advanced_strategies = {
    "Weighted Average (0.8/0.2)": lambda outs: 0.8*outs[0] + 0.2*outs[1],
    "Weighted Average (0.6/0.4)": lambda outs: 0.6*outs[0] + 0.4*outs[1],
    # NOTE: despite the name this is an element-wise maximum over the models'
    # logits, not a vote over predicted classes.
    "Max Voting": lambda outs: torch.max(torch.stack(outs), dim=0)[0],
    "Geometric Mean": _geometric_mean_fusion,
    "Parameter Merging (30/70)": None,
    "Parameter Merging (70/30)": None,
}

# Combine strategies
all_strategies = {**strategies, **advanced_strategies}

print(f"\n📊 Total strategies to evaluate: {len(all_strategies)}")

## Model Architecture Definition

Create two base models with different architectures for realistic fusion testing:

In [None]:
class MathWordProblemModel(nn.Module):
    """Feed-forward classifier used as a base model in the fusion experiment.

    The network is a stem (Linear -> ReLU -> Dropout), followed by
    ``num_layers - 1`` hidden blocks, followed by a Linear output head.
    ``model_variant`` selects both the hidden-block layout and the weight
    initialisation: ``"A"`` uses an expand/contract pair of Linear layers with
    Xavier-normal weights; any other value uses Linear -> GELU -> LayerNorm
    with Kaiming-normal weights.
    """

    def __init__(self, input_dim: int = 512, hidden_dim: int = 256, output_dim: int = 10,
                 num_layers: int = 3, dropout: float = 0.1, model_variant: str = "A"):
        super().__init__()
        self.model_variant = model_variant
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        # Stem: project the input down to the hidden width.
        stack = [nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Dropout(dropout)]

        # Hidden blocks; each one is followed by a shared ReLU + Dropout tail.
        for _ in range(num_layers - 1):
            stack.extend(self._hidden_block(hidden_dim, model_variant))
            stack.extend([nn.ReLU(), nn.Dropout(dropout)])

        # Classification head.
        stack.append(nn.Linear(hidden_dim, output_dim))

        self.network = nn.Sequential(*stack)

        # Re-initialise the Linear layers according to the variant.
        self._initialize_weights()

    @staticmethod
    def _hidden_block(hidden_dim, model_variant):
        """Return the variant-specific layers that open one hidden block."""
        if model_variant == "A":
            # Variant A: widen to 2x the hidden size, then contract back.
            return [
                nn.Linear(hidden_dim, hidden_dim * 2),
                nn.ReLU(),
                nn.Linear(hidden_dim * 2, hidden_dim),
            ]
        # Any other variant: constant width with GELU and layer normalisation.
        return [
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
        ]

    def _initialize_weights(self):
        """Initialise every Linear layer; the scheme depends on the variant."""
        use_xavier = self.model_variant == "A"
        for layer in self.modules():
            if not isinstance(layer, nn.Linear):
                continue

            if use_xavier:
                nn.init.xavier_normal_(layer.weight)
            else:
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')

            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Run ``x`` through the sequential network and return its output."""
        return self.network(x)

    def get_model_info(self):
        """Return the variant, parameter counts and layer dimensions as a dict."""
        param_sizes = [(p.numel(), p.requires_grad) for p in self.parameters()]
        total_params = sum(count for count, _ in param_sizes)
        trainable_params = sum(count for count, trainable in param_sizes if trainable)

        return {
            "variant": self.model_variant,
            "total_parameters": total_params,
            "trainable_parameters": trainable_params,
            "input_dim": self.input_dim,
            "hidden_dim": self.hidden_dim,
            "output_dim": self.output_dim
        }

# Instantiate the two base models that will be fused.
print("🏗️ Creating base models...")

# Dimensions shared by both models so their inputs and outputs line up.
_shared_dims = dict(input_dim=512, hidden_dim=256, output_dim=10)

# Model A: variant "A" (expand/contract hidden blocks, Xavier init), 3 layers.
model_A = MathWordProblemModel(num_layers=3, model_variant="A", **_shared_dims)

# Model B: variant "B" (GELU + LayerNorm hidden blocks, He init), 4 layers.
model_B = MathWordProblemModel(num_layers=4, model_variant="B", **_shared_dims)

print(f"✅ Model A created: {model_A.get_model_info()}")
print(f"✅ Model B created: {model_B.get_model_info()}")

# Switch both models to evaluation mode (disables dropout).
for _base_model in (model_A, model_B):
    _base_model.eval()

print("\n📊 Model Architecture Summary:")
print(f"Model A - Total params: {model_A.get_model_info()['total_parameters']:,}")
print(f"Model B - Total params: {model_B.get_model_info()['total_parameters']:,}")

## Data Generation and Task Definition

Create sample inputs and evaluation framework for mathematical reasoning tasks:

In [None]:
def load_sample_inputs(task: str = "math_word_problems", batch_size: int = 32, input_dim: int = 512) -> torch.Tensor:
    """
    Build a synthetic batch of input features.

    For ``task == "math_word_problems"`` the columns are laid out as:
      * 0-49:    "number" features, noise (std 0.5) plus a small per-column ramp
      * 51-99:   one-hot style operation block; row ``i`` activates the 12
                 columns belonging to operation ``i % 4`` (the last column of
                 the slice is never set)
      * 101-199: "context" noise (std 0.3)
      * 200+:    "general language" noise (std 0.2)
    Columns 50 and 100 keep the low-variance base noise (std 0.1).
    The slice assignments require ``input_dim >= 200``.

    Any other task returns plain standard-normal noise.
    """
    if task != "math_word_problems":
        # Generic task - standard random inputs
        return torch.randn(batch_size, input_dim)

    # Low-variance base noise for every column.
    features = torch.randn(batch_size, input_dim) * 0.1

    # Number block.
    features[:, :50] = torch.randn(batch_size, 50) * 0.5 + torch.arange(50).float() * 0.01

    # Operation block: compare each column's group (column // 12) with the
    # row's operation id (row % 4) to light up a 12-column band per row.
    row_operation = torch.arange(batch_size) % 4
    column_group = torch.div(torch.arange(49), 12, rounding_mode="floor")
    features[:, 51:100] = (column_group.unsqueeze(0) == row_operation.unsqueeze(1)).float()

    # Context block.
    features[:, 101:200] = torch.randn(batch_size, 99) * 0.3

    # Everything past column 200.
    features[:, 200:] = torch.randn(batch_size, input_dim - 200) * 0.2

    return features


def generate_ground_truth(inputs: torch.Tensor, task: str = "math_word_problems") -> torch.Tensor:
    """
    Produce integer class labels in ``[0, 10)`` for a batch of inputs.

    For ``"math_word_problems"`` the label is derived from the inputs: the mean
    of columns 0-49 is scaled by 10, added to the argmax position within
    columns 51-99, reduced modulo 10 and truncated to an integer. Any other
    task gets uniformly random labels, one per row.
    """
    if task != "math_word_problems":
        # Random labels for generic tasks
        return torch.randint(0, 10, (inputs.shape[0],))

    number_signal = inputs[:, :50].mean(dim=1)
    operation_index = inputs[:, 51:100].argmax(dim=1)

    # The remainder takes the sign of the divisor, so negative scores still
    # land in [0, 10) before truncation.
    score = number_signal * 10 + operation_index
    return torch.remainder(score, 10).long()


def evaluate_output(predictions: torch.Tensor, targets: torch.Tensor = None,
                   task: str = "math_word_problems") -> Dict[str, float]:
    """
    Evaluate model predictions against ground truth.

    Two input layouts are handled:
      * multi-class logits of shape ``(batch, classes)`` with ``classes > 1``:
        the class is the argmax, confidence is the max softmax probability,
        and entropy / top-2 accuracy are computed from the logits;
      * anything else (1-D scores or a single column): values are rounded to
        the nearest integer class, confidence is fixed at 1.0, entropy is 0.0
        and top-2 accuracy equals accuracy.

    Args:
        predictions: Model outputs, indexed by sample along dim 0.
        targets: Integer class labels, one per sample. If ``None``, random
            labels are drawn (demonstration only; this path indexes
            ``predictions.shape[1]`` and so needs at least 2-D predictions).
        task: Accepted for call-site symmetry; not used in the computation.

    Returns:
        Dict with ``accuracy``, ``confidence``, ``entropy``,
        ``top_2_accuracy`` (floats) and ``sample_size`` (int).
    """
    if targets is None:
        # Generate targets if not provided (for demonstration)
        # In real usage, targets would always be provided
        batch_size = predictions.shape[0]
        targets = torch.randint(0, predictions.shape[1], (batch_size,))

    # One flag drives every branch below. The original tested two different
    # conditions, so (batch, 1) inputs reached the entropy/top-k code without
    # `probabilities` ever being defined.
    is_multiclass = predictions.dim() > 1 and predictions.shape[1] > 1

    if is_multiclass:
        probabilities = torch.softmax(predictions, dim=1)
        pred_classes = torch.argmax(predictions, dim=1)
        confidence = probabilities.max(dim=1)[0]

        entropy = -torch.sum(probabilities * torch.log(probabilities + 1e-8), dim=1).mean().item()

        # A sample counts as a top-2 hit if its label is among the two
        # highest-scoring classes.
        top_2_indices = torch.topk(predictions, 2, dim=1)[1]
        top_2_hits = (top_2_indices == targets.unsqueeze(1)).any(dim=1)
        top_2_acc = top_2_hits.float().mean().item()
    else:
        # Flatten so (batch, 1) and (batch,) inputs both give one class per sample.
        pred_classes = predictions.round().long().reshape(-1)
        confidence = torch.ones_like(pred_classes).float()
        entropy = 0.0
        top_2_acc = None  # filled in from accuracy below

    accuracy = (pred_classes == targets).float().mean().item()
    avg_confidence = confidence.mean().item()
    if top_2_acc is None:
        top_2_acc = accuracy

    return {
        "accuracy": accuracy,
        "confidence": avg_confidence,
        "entropy": entropy,
        "top_2_accuracy": float(top_2_acc),
        "sample_size": len(targets)
    }


# Build the single evaluation batch shared by every later cell.
print("📊 Generating sample data for experiments...")

# Inputs first, then labels derived from those same inputs.
inputs = load_sample_inputs(batch_size=64, task="math_word_problems")
targets = generate_ground_truth(inputs, "math_word_problems")

print(f"✅ Generated inputs: {inputs.shape}")
print(f"✅ Generated targets: {targets.shape}")
# bincount gives the number of samples per label value.
print(f"📈 Target distribution: {torch.bincount(targets)}")
print(f"🎯 Input statistics - Mean: {inputs.mean():.4f}, Std: {inputs.std():.4f}")

## Model Evaluation: Individual Performance

First, evaluate each base model individually to establish baseline performance:

In [None]:
print("🔍 Evaluating individual model performance...")

# Forward both base models once, without tracking gradients.
with torch.no_grad():
    outputs_A, outputs_B = model_A(inputs), model_B(inputs)

# Score each model against the shared targets.
results_A = evaluate_output(outputs_A, targets, task="math_word_problems")
results_B = evaluate_output(outputs_B, targets, task="math_word_problems")

print("\n📊 Individual Model Performance:")
print("=" * 50)

# Print one metric table per model; floats get four decimals, other values
# (such as the integer sample size) are printed as-is.
for _header, _model_results in (
    (f"\n🤖 Model A Results:", results_A),
    (f"\n🤖 Model B Results:", results_B),
):
    print(_header)
    for metric, value in _model_results.items():
        if isinstance(value, float):
            print(f"  {metric:.<20}: {value:.4f}")
        else:
            print(f"  {metric:.<20}: {value}")

# Pick the stronger baseline; a tie is reported as Model B.
if results_A['accuracy'] > results_B['accuracy']:
    better_model = "A"
else:
    better_model = "B"
performance_gap = abs(results_A['accuracy'] - results_B['accuracy'])

print(f"\n🏆 Better performing model: Model {better_model}")
print(f"📈 Performance gap: {performance_gap:.4f} ({performance_gap*100:.2f}%)")

# Keep both raw outputs together for the output-level fusion strategies.
outputs_ens = [outputs_A, outputs_B]

## Parameter Merging Implementation

Create merged models using different parameter fusion ratios:

In [None]:
def merge_models(model_a: nn.Module, model_b: nn.Module, alpha: float = 0.5) -> nn.Module:
    """
    Merge two models using linear interpolation of parameters.

    The merged model is a deep copy of ``model_a``, so it always has
    ``model_a``'s architecture. A state-dict entry is interpolated only when
    ``model_b`` has an entry with the same key, the same shape, and both
    tensors are floating point. Every other entry keeps ``model_a``'s value,
    and entries that exist only in ``model_b`` have no slot and are ignored.

    Matching is by key and shape only. When the two architectures differ, a
    shared key does not guarantee the tensors play the same role (for example
    a Linear bias and a LayerNorm bias can share a key and a shape).

    Args:
        model_a: First model; defines the merged architecture.
        model_b: Second model.
        alpha: Mixing ratio (0.5 = equal mix, 0.7 = 70% model_a + 30% model_b)

    Returns:
        A new model in eval mode with interpolated parameters. Neither input
        model is modified.
    """
    import copy  # local import: the file's import block does not provide it

    # Copying model_a guarantees the state dict built below fits exactly.
    # Rebuilding from constructor arguments cannot, because depth and variant
    # are not recoverable from the stored attributes.
    merged_model = copy.deepcopy(model_a)
    if hasattr(merged_model, "model_variant"):
        merged_model.model_variant = "Merged"

    model_a_state = model_a.state_dict()
    model_b_state = model_b.state_dict()

    merged_state_dict = {}
    for key, tensor_a in model_a_state.items():
        tensor_b = model_b_state.get(key)
        can_interpolate = (
            tensor_b is not None
            and tensor_b.shape == tensor_a.shape
            and tensor_a.is_floating_point()
            and tensor_b.is_floating_point()
        )
        if can_interpolate:
            # Linear interpolation of parameters
            merged_state_dict[key] = alpha * tensor_a + (1 - alpha) * tensor_b
        else:
            # Missing, differently shaped, or non-float in model_b: keep model_a's.
            merged_state_dict[key] = tensor_a.clone()

    # Keys match model_a exactly, so the default strict load is safe and would
    # surface any mistake instead of silently skipping it.
    merged_model.load_state_dict(merged_state_dict)
    merged_model.eval()

    return merged_model


print("⚙️ Creating parameter-merged models...")

# Create merged models with different ratios
merged_models = {}
merge_ratios = [0.5, 0.3, 0.7]  # 50/50, 30/70, 70/30

for alpha in merge_ratios:
    print(f"  🔧 Creating merged model with ratio {alpha:.1f}/{1-alpha:.1f}...")
    merged_model = merge_models(model_A, model_B, alpha=alpha)

    # Build the strategy key, e.g. "Parameter Merging (70/30)". Percentages
    # are rounded, not truncated: float products such as 0.57 * 100 land just
    # below the integer, and int() would then yield a key that matches no
    # entry in all_strategies. The second share is taken as the complement so
    # the two always sum to 100.
    pct_a = round(alpha * 100)
    ratio_str = f"({pct_a}/{100 - pct_a})"
    strategy_name = f"Parameter Merging {ratio_str}"
    merged_models[strategy_name] = merged_model

    print(f"    ✅ {strategy_name} model created")

# Update strategies dictionary with merged models
for strategy_name, merged_model in merged_models.items():
    if strategy_name in all_strategies:
        # Bind the model as a default argument so each lambda keeps its own
        # model instead of the loop variable's final value. Unlike the
        # output-level strategies, this callable takes raw inputs.
        all_strategies[strategy_name] = lambda inputs, model=merged_model: model(inputs)

print(f"\n✅ Created {len(merged_models)} parameter-merged models")
print(f"📊 Total fusion strategies ready: {sum(1 for v in all_strategies.values() if v is not None)}")

## Comprehensive Fusion Strategy Evaluation

Evaluate every fusion strategy on the same inputs and targets, recording accuracy, execution time, and detailed metrics for each:

In [None]:
# Main evaluation loop: run every strategy on the shared batch and record
# accuracy, inference time and the full metric dict.
print("🧪 Running comprehensive fusion strategy evaluation...")
print("=" * 60)

results = {}
timing_results = {}
detailed_metrics = {}

for name, func in all_strategies.items():
    print(f"\n🔬 Evaluating: {name}")

    is_parameter_merging = "Parameter Merging" in name

    # Skip strategies that cannot be run before starting the clock.
    if is_parameter_merging and name not in merged_models:
        print(f"  ⚠️ Merged model not found for {name}, skipping...")
        continue
    if not is_parameter_merging and func is None:
        print(f"  ⚠️ Function not implemented for {name}, skipping...")
        continue

    try:
        # Time only the work needed to produce fused predictions from raw
        # inputs. Output-level fusion therefore includes both base-model
        # forward passes; timing just the arithmetic on cached outputs would
        # make it look far cheaper than a merged model's forward pass.
        # Metric computation and printing are excluded.
        start_time = time.perf_counter()
        with torch.no_grad():
            if is_parameter_merging:
                fused_out = merged_models[name](inputs)
            else:
                fused_out = func([model_A(inputs), model_B(inputs)])
        execution_time = time.perf_counter() - start_time

        evaluation_result = evaluate_output(fused_out, targets, task="math_word_problems")
    except Exception as e:
        # One failing strategy is reported and skipped so the rest still run.
        print(f"  ❌ Error evaluating {name}: {str(e)}")
        continue

    # Store results
    results[name] = evaluation_result['accuracy']
    timing_results[name] = execution_time
    detailed_metrics[name] = evaluation_result

    print(f"  ✅ Accuracy: {evaluation_result['accuracy']:.4f}")
    print(f"  ⏱️ Time: {execution_time:.6f}s")
    print(f"  🎯 Confidence: {evaluation_result['confidence']:.4f}")

print(f"\n🎉 Evaluation completed for {len(results)} strategies")

## Results Analysis and Visualization

Comprehensive analysis of fusion strategy performance:

In [None]:
# Report the ranking, summary statistics, and comparison with the base models.
_banner = "=" * 80
print("\n" + _banner)
print("🏆 FUSION STRATEGY COMPARISON RESULTS")
print(_banner)

# Highest accuracy first.
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

print(f"\n📊 Performance Ranking (Top to Bottom):")
print("-" * 60)

for rank, (strategy, accuracy) in enumerate(sorted_results, start=1):
    timing = timing_results.get(strategy, 0)
    metrics = detailed_metrics.get(strategy, {})

    print(f"{rank:2d}. {strategy:<35} | Acc: {accuracy:.4f} | Time: {timing:.4f}s")
    if not metrics:
        continue
    _detail_line = (
        f"     Confidence: {metrics.get('confidence', 0):.4f} | "
        f"Top-2 Acc: {metrics.get('top_2_accuracy', 0):.4f} | "
        f"Entropy: {metrics.get('entropy', 0):.4f}"
    )
    print(_detail_line)

# Summary statistics over all evaluated strategies.
accuracies = list(results.values())
best_accuracy, worst_accuracy = max(accuracies), min(accuracies)
mean_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)
_best_name = sorted_results[0][0]
_worst_name = sorted_results[-1][0]

print(f"\n📈 Statistical Summary:")
print("-" * 30)
print(f"Best Performance:     {best_accuracy:.4f} ({_best_name})")
print(f"Worst Performance:    {worst_accuracy:.4f} ({_worst_name})")
print(f"Mean Performance:     {mean_accuracy:.4f}")
print(f"Standard Deviation:   {std_accuracy:.4f}")
print(f"Performance Range:    {best_accuracy - worst_accuracy:.4f}")

# How the best fusion compares with each base model on its own.
_acc_a = results_A['accuracy']
_acc_b = results_B['accuracy']

print(f"\n🤖 Comparison with Individual Models:")
print("-" * 40)
print(f"Model A Accuracy:     {_acc_a:.4f}")
print(f"Model B Accuracy:     {_acc_b:.4f}")
print(f"Best Fusion:          {best_accuracy:.4f}")

improvement_over_best = best_accuracy - max(_acc_a, _acc_b)
improvement_over_avg = best_accuracy - (_acc_a + _acc_b) / 2

print(f"Improvement over best individual: {improvement_over_best:.4f} ({improvement_over_best*100:.2f}%)")
print(f"Improvement over average:        {improvement_over_avg:.4f} ({improvement_over_avg*100:.2f}%)")

In [None]:
# Create comprehensive visualizations: a 2x2 grid summarising the results.
from matplotlib.patches import Patch

# One colour per strategy family, reused by every panel so the legend in
# panel 2 and the bars in panel 4 agree (red family = parameter merging,
# blue family = output fusion).
SCATTER_COLORS = {'Parameter Merging': 'red', 'Output Fusion': 'blue'}
BAR_COLORS = {'Parameter Merging': 'lightcoral', 'Output Fusion': 'lightblue'}


def _strategy_type(strategy_name):
    """Classify a strategy name as 'Parameter Merging' or 'Output Fusion'."""
    return 'Parameter Merging' if 'Parameter Merging' in strategy_name else 'Output Fusion'


fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Fusion Strategy Comparison Analysis', fontsize=16, fontweight='bold')

# 1. Performance comparison bar chart
strategies_list = list(results.keys())
accuracies_list = list(results.values())
timings_list = [timing_results.get(s, 0) for s in strategies_list]

# Sort by accuracy for better visualization
sorted_indices = np.argsort(accuracies_list)[::-1]
sorted_strategies = [strategies_list[i] for i in sorted_indices]
sorted_accuracies = [accuracies_list[i] for i in sorted_indices]

# The best strategy (first after sorting) is highlighted in gold.
bars1 = ax1.bar(range(len(sorted_strategies)), sorted_accuracies,
                color=['gold' if i == 0 else 'skyblue' for i in range(len(sorted_strategies))])
ax1.set_title('Fusion Strategy Performance Comparison', fontweight='bold')
ax1.set_xlabel('Fusion Strategy')
ax1.set_ylabel('Accuracy')
ax1.set_xticks(range(len(sorted_strategies)))
ax1.set_xticklabels([s.replace(' ', '\n') for s in sorted_strategies], rotation=45, ha='right')
ax1.grid(True, alpha=0.3)

# Add accuracy values on bars
for bar, acc in zip(bars1, sorted_accuracies):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.002,
             f'{acc:.3f}', ha='center', va='bottom', fontsize=8)

# 2. Performance vs Timing scatter plot
colors = [SCATTER_COLORS[_strategy_type(s)] for s in strategies_list]
scatter = ax2.scatter(timings_list, accuracies_list, c=colors, alpha=0.7, s=60)
ax2.set_title('Performance vs Execution Time', fontweight='bold')
ax2.set_xlabel('Execution Time (seconds)')
ax2.set_ylabel('Accuracy')
ax2.grid(True, alpha=0.3)

# Add legend for parameter merging vs output fusion
legend_elements = [Patch(facecolor=SCATTER_COLORS['Parameter Merging'], alpha=0.7, label='Parameter Merging'),
                  Patch(facecolor=SCATTER_COLORS['Output Fusion'], alpha=0.7, label='Output Fusion')]
ax2.legend(handles=legend_elements, loc='best')

# Annotate best performing points
for i, (strategy, acc) in enumerate(zip(strategies_list, accuracies_list)):
    if acc > np.percentile(accuracies_list, 80):  # Top 20% performers
        ax2.annotate(strategy.split()[0], (timings_list[i], acc),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

# 3. Confidence vs Accuracy analysis
confidences = [detailed_metrics.get(s, {}).get('confidence', 0) for s in strategies_list]
ax3.scatter(confidences, accuracies_list, alpha=0.7, s=60)
ax3.set_title('Model Confidence vs Accuracy', fontweight='bold')
ax3.set_xlabel('Average Prediction Confidence')
ax3.set_ylabel('Accuracy')
ax3.grid(True, alpha=0.3)

# Add trend line. A linear fit needs at least two distinct, finite x values;
# non-finite confidences (e.g. from a strategy that produced NaN outputs) are
# left out of the fit.
finite_points = [(c, a) for c, a in zip(confidences, accuracies_list)
                 if np.isfinite(c) and np.isfinite(a)]
if len({c for c, _ in finite_points}) > 1:
    fit_x, fit_y = zip(*finite_points)
    z = np.polyfit(fit_x, fit_y, 1)
    p = np.poly1d(z)
    fit_x_sorted = sorted(fit_x)
    ax3.plot(fit_x_sorted, p(fit_x_sorted), "r--", alpha=0.8)

# 4. Strategy type comparison
strategy_types = {
    'Output Fusion': [],
    'Parameter Merging': []
}

for strategy, acc in results.items():
    strategy_types[_strategy_type(strategy)].append(acc)

type_names = list(strategy_types.keys())
type_means = [np.mean(strategy_types[t]) if strategy_types[t] else 0 for t in type_names]
type_stds = [np.std(strategy_types[t]) if len(strategy_types[t]) > 1 else 0 for t in type_names]

# Colours are looked up by type name so they cannot drift from the bar order.
bars4 = ax4.bar(type_names, type_means, yerr=type_stds, capsize=5,
                color=[BAR_COLORS[t] for t in type_names], alpha=0.7)
ax4.set_title('Strategy Type Comparison', fontweight='bold')
ax4.set_ylabel('Mean Accuracy')
ax4.grid(True, alpha=0.3)

# Add mean values on bars
for bar, mean in zip(bars4, type_means):
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
             f'{mean:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("📊 Visualization complete!")

## Key Insights and Recommendations

Analysis of fusion strategy performance to guide MVP development:

In [None]:
print("\n" + "=" * 80)
print("🎯 KEY INSIGHTS AND RECOMMENDATIONS FOR MVP")
print("=" * 80)

# Split results by strategy family.
output_fusion_results = {k: v for k, v in results.items() if 'Parameter Merging' not in k}
parameter_merging_results = {k: v for k, v in results.items() if 'Parameter Merging' in k}

# Best entry per family as a (name, accuracy) pair; ("None", 0) when a family is empty.
best_output_fusion = max(output_fusion_results.items(), key=lambda x: x[1]) if output_fusion_results else ("None", 0)
best_parameter_merging = max(parameter_merging_results.items(), key=lambda x: x[1]) if parameter_merging_results else ("None", 0)
overall_best = max(results.items(), key=lambda x: x[1])

print(f"\n🏆 PERFORMANCE WINNERS:")
print("-" * 40)
print(f"Overall Best:           {overall_best[0]} ({overall_best[1]:.4f})")
print(f"Best Output Fusion:     {best_output_fusion[0]} ({best_output_fusion[1]:.4f})")
print(f"Best Parameter Merging: {best_parameter_merging[0]} ({best_parameter_merging[1]:.4f})")

# Timing analysis
fastest_strategy = min(timing_results.items(), key=lambda x: x[1])
slowest_strategy = max(timing_results.items(), key=lambda x: x[1])

# A measured time of exactly 0 is possible with a coarse clock; report an
# infinite ratio in that case rather than dividing by zero.
if fastest_strategy[1] > 0:
    speed_ratio = slowest_strategy[1] / fastest_strategy[1]
else:
    speed_ratio = float('inf')

print(f"\n⚡ TIMING ANALYSIS:")
print("-" * 25)
print(f"Fastest Strategy:  {fastest_strategy[0]} ({fastest_strategy[1]:.6f}s)")
print(f"Slowest Strategy:  {slowest_strategy[0]} ({slowest_strategy[1]:.6f}s)")
print(f"Speed Difference:  {speed_ratio:.1f}x")

# MVP Recommendations
print(f"\n🚀 MVP DEVELOPMENT RECOMMENDATIONS:")
print("-" * 45)

# Determine best approach (ties go to output fusion).
if best_parameter_merging[1] > best_output_fusion[1]:
    recommendation = "Parameter Merging"
    reason = f"Parameter merging achieved {best_parameter_merging[1]:.4f} vs {best_output_fusion[1]:.4f} for output fusion"
else:
    recommendation = "Output Fusion"
    reason = f"Output fusion achieved {best_output_fusion[1]:.4f} vs {best_parameter_merging[1]:.4f} for parameter merging"

print(f"1. 🎯 PRIMARY STRATEGY: {recommendation}")
print(f"   Rationale: {reason}")

# Efficiency consideration
output_fusion_times = [timing_results[k] for k in output_fusion_results.keys()]
parameter_merging_times = [timing_results[k] for k in parameter_merging_results.keys()]

avg_output_time = np.mean(output_fusion_times) if output_fusion_times else 0
avg_param_time = np.mean(parameter_merging_times) if parameter_merging_times else 0

if avg_output_time < avg_param_time:
    efficiency_winner = "Output Fusion"
    efficiency_factor = avg_param_time / avg_output_time if avg_output_time > 0 else 1
else:
    efficiency_winner = "Parameter Merging"
    efficiency_factor = avg_output_time / avg_param_time if avg_param_time > 0 else 1

print(f"\n2. ⚡ EFFICIENCY CHAMPION: {efficiency_winner}")
print(f"   Speed advantage: {efficiency_factor:.1f}x faster on average")

# Robustness analysis
# NOTE: the values printed under the "Variance" labels are standard deviations.
output_fusion_std = np.std(list(output_fusion_results.values())) if output_fusion_results else 0
parameter_merging_std = np.std(list(parameter_merging_results.values())) if parameter_merging_results else 0

print(f"\n3. 🛡️ ROBUSTNESS ANALYSIS:")
print(f"   Output Fusion Variance:     {output_fusion_std:.4f}")
print(f"   Parameter Merging Variance: {parameter_merging_std:.4f}")

if output_fusion_std < parameter_merging_std:
    print(f"   → Output fusion shows more consistent results")
else:
    print(f"   → Parameter merging shows more consistent results")

# Pick the best configuration by accuracy. Comparing bare (name, accuracy)
# tuples would order by name first and return the alphabetically last one.
best_weighted_average = max(
    ((k, v) for k, v in results.items() if 'Weighted Average' in k),
    key=lambda item: item[1],
    default=("None", 0),
)

print(f"\n4. 🎛️ OPTIMAL CONFIGURATIONS:")
print(f"   Best Weighted Average: {best_weighted_average[0]}")
print(f"   Best Parameter Ratio:  {best_parameter_merging[0]}")

print(f"\n5. 💡 IMPLEMENTATION STRATEGY:")
print(f"   • Start with {overall_best[0]} as primary fusion method")
print(f"   • Implement {fastest_strategy[0]} for latency-critical scenarios")
print(f"   • Consider ensemble of top 3 performers for maximum robustness")
print(f"   • A/B test between parameter merging vs output fusion in production")

# Final verdict: does the best fusion beat the better of the two base models?
improvement_vs_individual = overall_best[1] - max(results_A['accuracy'], results_B['accuracy'])

print(f"\n✅ EXPERIMENT CONCLUSION:")
print("-" * 30)
if improvement_vs_individual > 0:
    print(f"🎉 Model fusion IS beneficial! Best fusion improves performance by {improvement_vs_individual:.4f} ({improvement_vs_individual*100:.2f}%)")
    print(f"📈 Recommended approach: {overall_best[0]}")
else:
    print(f"⚠️ Model fusion shows minimal benefit. Individual models may be sufficient.")
    print(f"🤔 Consider: Better base models, different fusion techniques, or task-specific optimization")

print(f"\n🔬 Next steps: Scale experiment to larger datasets and production workloads")

## Export Results for Further Analysis

Save experimental results for documentation and future reference:

In [None]:
# Gather everything worth archiving into one JSON-serialisable dictionary.
_fusion_results = {}
for _strategy, _accuracy in results.items():
    _fusion_results[_strategy] = {
        "accuracy": _accuracy,
        "execution_time": timing_results.get(_strategy, 0),
        "detailed_metrics": detailed_metrics.get(_strategy, {})
    }

experiment_summary = {
    "experiment_info": {
        "date": str(pd.Timestamp.now()),
        "task": "math_word_problems",
        "sample_size": len(targets),
        "input_dimension": inputs.shape[1],
        "num_strategies": len(results)
    },
    "model_architectures": {
        "model_a": model_A.get_model_info(),
        "model_b": model_B.get_model_info()
    },
    "individual_performance": {
        "model_a": results_A,
        "model_b": results_B
    },
    "fusion_results": _fusion_results,
    "analysis": {
        "best_overall": overall_best[0],
        "best_overall_accuracy": overall_best[1],
        "best_output_fusion": best_output_fusion[0],
        "best_parameter_merging": best_parameter_merging[0],
        "improvement_over_individual": improvement_vs_individual,
        "recommendation": recommendation
    }
}

# Both output files live in <project_root>/experiments.
_output_dir = project_root / "experiments"
_output_dir.mkdir(exist_ok=True)

# JSON dump; values json cannot encode natively are stringified via default=str.
results_file = _output_dir / "fusion_experiment_results.json"
with results_file.open('w') as f:
    f.write(json.dumps(experiment_summary, indent=2, default=str))

# Flat per-strategy table for quick analysis.
_strategy_names = list(results)
results_df = pd.DataFrame({
    'Strategy': _strategy_names,
    'Accuracy': [results[s] for s in _strategy_names],
    'Execution_Time': [timing_results.get(s, 0) for s in _strategy_names],
    'Confidence': [detailed_metrics.get(s, {}).get('confidence', 0) for s in _strategy_names],
    'Type': ['Parameter Merging' if 'Parameter Merging' in s else 'Output Fusion' for s in _strategy_names]
})

# CSV copy of the table.
csv_file = _output_dir / "fusion_experiment_results.csv"
results_df.to_csv(csv_file, index=False)

print(f"💾 Experiment results saved:")
print(f"  📄 JSON: {results_file}")
print(f"  📊 CSV:  {csv_file}")

# Final table, best accuracy first.
print(f"\n📋 FINAL RESULTS SUMMARY:")
_ranked_df = results_df.sort_values('Accuracy', ascending=False)
print(_ranked_df.to_string(index=False, float_format='{:.4f}'.format))

print(f"\n🎊 Model Fusion Comparison Experiment Complete!")
print(f"✅ Successfully benchmarked {len(results)} fusion strategies")
print(f"🏆 Winner: {overall_best[0]} with {overall_best[1]:.4f} accuracy")
print(f"📈 Performance improvement: {improvement_vs_individual*100:.2f}% over best individual model")