# Lab-1.2: PyTorch DDP 分散式訓練基礎 - 03-Optimization
## 通訊優化與性能調優

---

## ⚠️ 注意事項

本notebook專注於**DDP通訊優化的理論和實作技巧**，在單GPU環境中演示優化配置。
- ✅ **可學習**: 通訊優化策略、性能調優技巧、配置最佳實踐
- ⚠️ **限制**: 無法測量真實的多GPU通訊性能

---

## 📚 學習目標

1. 理解DDP通訊瓶頸和優化策略
2. 掌握梯度累積和混合精度訓練
3. 學習通訊與計算重疊技術
4. 實作性能監控和調優工具

## 1. 載入基礎設置

In [None]:
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.utils.data as data
from torch.cuda.amp import autocast, GradScaler

import os
import time
import json
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Dict, List, Optional
import psutil
import threading

# Print a short environment report in one call (one line per item).
print(
    f"PyTorch 版本: {torch.__version__}",
    f"CUDA 可用: {torch.cuda.is_available()}",
    f"GPU 數量: {torch.cuda.device_count()}",
    f"分散式支援: {dist.is_available()}",
    f"NCCL 支援: {dist.is_nccl_available()}",
    sep="\n",
)

# Prefer CUDA when it is available; otherwise every demo below runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"使用設備: {device}")

## 2. 通訊分析工具

In [None]:
@dataclass
class CommunicationProfile:
    """Record describing the outcome of one profiled communication operation.

    NOTE(review): nothing in the visible part of the file instantiates this
    class; the units below are inferred from the field names only — confirm
    against the code that fills it in.
    """
    operation: str  # label of the profiled operation
    data_size_mb: float  # payload size (presumably megabytes)
    time_ms: float  # elapsed time (presumably milliseconds)
    bandwidth_gbps: float  # achieved bandwidth (presumably gigabits per second)
    gpu_count: int  # number of GPUs that took part
    backend: str  # communication backend name

class DDPCommunicationAnalyzer:
    """Analytical (not measured) model of DDP gradient communication.

    The analyzer derives the gradient payload from the model's trainable
    parameters and prints back-of-the-envelope estimates for several
    collective algorithms, for ring all-reduce timing and for gradient
    compression schemes. No real communication is performed.

    Args:
        model: module whose trainable parameters define the gradient size.
        device: device the model lives on (stored, not otherwise used here).
        world_size: number of participating ranks; must be >= 1.

    Raises:
        ValueError: if ``world_size`` is smaller than 1.
    """

    def __init__(self, model, device, world_size=1):
        if world_size < 1:
            raise ValueError(f"world_size must be >= 1, got {world_size}")

        self.model = model
        self.device = device
        self.world_size = world_size
        self.profiles = []

        # Only trainable parameters produce gradients that must be exchanged.
        self.model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        self.gradient_size_mb = self.model_params * 4 / (1024 * 1024)  # FP32: 4 bytes/param
        self.gradient_size_mb_fp16 = self.model_params * 2 / (1024 * 1024)  # FP16: 2 bytes/param

        print(f"=== DDP 通訊分析器初始化 ===")
        print(f"模型參數量: {self.model_params:,}")
        print(f"梯度大小 (FP32): {self.gradient_size_mb:.2f} MB")
        print(f"梯度大小 (FP16): {self.gradient_size_mb_fp16:.2f} MB")
        print(f"World Size: {self.world_size}")

    def analyze_communication_patterns(self):
        """Print and return per-algorithm step counts and data volumes.

        Returns:
            dict mapping algorithm name to a dict with the keys
            ``description``, ``steps``, ``data_per_step``, ``total_data``
            and ``latency_factor`` (data volumes in MB, per rank).
        """
        print("\n=== 通訊模式分析 ===")

        # Ring all-reduce = reduce-scatter (N-1 steps) + all-gather (N-1 steps);
        # every step moves one 1/N-sized chunk of the gradient.
        ring_steps = 2 * (self.world_size - 1)
        ring_chunk_mb = self.gradient_size_mb / self.world_size
        # Tree all-reduce: reduce up the tree, then broadcast back down.
        tree_steps = 2 * int(np.log2(self.world_size))

        patterns = {
            'All-Reduce (Ring)': {
                'description': 'Ring拓撲的All-Reduce算法',
                'steps': ring_steps,
                'data_per_step': ring_chunk_mb,
                'total_data': ring_chunk_mb * ring_steps,
                'latency_factor': ring_steps
            },
            'All-Reduce (Tree)': {
                'description': 'Tree拓撲的All-Reduce算法',
                'steps': tree_steps,
                'data_per_step': self.gradient_size_mb,
                'total_data': self.gradient_size_mb * tree_steps,
                'latency_factor': tree_steps
            },
            'Parameter Server': {
                'description': '參數服務器模式',
                'steps': 2,  # gather + broadcast
                'data_per_step': self.gradient_size_mb * (self.world_size - 1),
                'total_data': self.gradient_size_mb * 2 * (self.world_size - 1),
                'latency_factor': 2
            }
        }

        for name, pattern in patterns.items():
            print(f"\n{name}:")
            print(f"  描述: {pattern['description']}")
            print(f"  通訊步數: {pattern['steps']}")
            print(f"  每步數據量: {pattern['data_per_step']:.2f} MB")
            print(f"  總數據量: {pattern['total_data']:.2f} MB")
            print(f"  延遲係數: {pattern['latency_factor']}")

            # With a single rank there is no communication and the latency
            # factor can be 0, so the ratio is only reported for N > 1.
            if self.world_size > 1:
                efficiency = 1.0 / pattern['latency_factor']
                print(f"  理論效率: {efficiency:.3f}")

        return patterns

    def simulate_communication_overhead(self, bandwidth_gbps=10.0, latency_us=5.0):
        """Estimate ring all-reduce time for the FP32 gradient.

        Args:
            bandwidth_gbps: per-link bandwidth in gigabits per second.
            latency_us: per-step latency in microseconds.

        Returns:
            dict keyed by GPU count; each value holds ``transfer_time_ms``,
            ``latency_time_ms``, ``total_time_ms`` and
            ``effective_bandwidth_gbps``.
        """
        print(f"\n=== 通訊開銷模擬 ===")
        print(f"假設條件: 帶寬 {bandwidth_gbps} Gbps, 延遲 {latency_us} μs")

        # Sweep typical cluster sizes when no real world size was given.
        gpu_counts = [2, 4, 8, 16, 32] if self.world_size == 1 else [self.world_size]

        results = {}
        for gpu_count in gpu_counts:
            # Ring all-reduce: 2*(N-1) steps, each sending one 1/N chunk.
            ring_steps = 2 * (gpu_count - 1)
            chunk_size_mb = self.gradient_size_mb / gpu_count
            # MB * 8 = megabits; megabits / (gigabits per second) = milliseconds.
            transfer_time_ms = ring_steps * (chunk_size_mb * 8) / bandwidth_gbps
            latency_time_ms = latency_us * ring_steps / 1000
            total_time_ms = transfer_time_ms + latency_time_ms

            # megabits / milliseconds = gigabits per second. Dividing by
            # seconds instead would yield Mbps and overstate the value 1000x.
            if total_time_ms > 0:
                effective_bandwidth_gbps = (self.gradient_size_mb * 8) / total_time_ms
            else:
                effective_bandwidth_gbps = 0.0

            results[gpu_count] = {
                'transfer_time_ms': transfer_time_ms,
                'latency_time_ms': latency_time_ms,
                'total_time_ms': total_time_ms,
                'effective_bandwidth_gbps': effective_bandwidth_gbps
            }

            print(f"\n{gpu_count} GPU:")
            print(f"  數據傳輸時間: {transfer_time_ms:.2f} ms")
            print(f"  延遲時間: {latency_time_ms:.2f} ms")
            print(f"  總通訊時間: {total_time_ms:.2f} ms")
            print(f"  有效帶寬: {results[gpu_count]['effective_bandwidth_gbps']:.2f} Gbps")

        return results

    def analyze_gradient_compression(self):
        """Print and return a table of gradient compression schemes.

        The ``accuracy_loss`` figures are illustrative constants hard-coded
        in this method, not measurements.

        Returns:
            dict mapping scheme name to ``bits_per_param``,
            ``compression_ratio`` and ``accuracy_loss``.
        """
        print(f"\n=== 梯度壓縮分析 ===")

        compression_methods = {
            'FP32 (無壓縮)': {
                'bits_per_param': 32,
                'compression_ratio': 1.0,
                'accuracy_loss': 0.0
            },
            'FP16': {
                'bits_per_param': 16,
                'compression_ratio': 2.0,
                'accuracy_loss': 0.01
            },
            'BF16': {
                'bits_per_param': 16,
                'compression_ratio': 2.0,
                'accuracy_loss': 0.005
            },
            'INT8 量化': {
                'bits_per_param': 8,
                'compression_ratio': 4.0,
                'accuracy_loss': 0.02
            },
            'Top-K稀疏化': {
                'bits_per_param': 32,  # values keep full precision
                'compression_ratio': 10.0,  # only 10% of the entries are sent
                'accuracy_loss': 0.03
            },
            '隨機量化': {
                'bits_per_param': 1,  # extreme compression
                'compression_ratio': 32.0,
                'accuracy_loss': 0.05
            }
        }

        for method, props in compression_methods.items():
            compressed_size = self.gradient_size_mb / props['compression_ratio']
            bandwidth_savings = (1 - 1/props['compression_ratio']) * 100

            print(f"\n{method}:")
            print(f"  每參數位數: {props['bits_per_param']}")
            print(f"  壓縮比: {props['compression_ratio']:.1f}x")
            print(f"  壓縮後大小: {compressed_size:.2f} MB")
            print(f"  帶寬節省: {bandwidth_savings:.1f}%")
            print(f"  精度損失: {props['accuracy_loss']:.1%}")

        return compression_methods

# Build a small three-layer MLP that serves as the demo model for every
# analysis in this notebook, then move it to the selected device.
demo_model = nn.Sequential(
    nn.Linear(1024, 512), nn.ReLU(),
    nn.Linear(512, 256), nn.ReLU(),
    nn.Linear(256, 10),
)
demo_model = demo_model.to(device)

# Analyze the demo model as if it were trained on 4 ranks.
analyzer = DDPCommunicationAnalyzer(model=demo_model, device=device, world_size=4)

# Run the three analyses in order and keep their results for later cells.
communication_patterns = analyzer.analyze_communication_patterns()
communication_overhead = analyzer.simulate_communication_overhead()
compression_analysis = analyzer.analyze_gradient_compression()

## 3. 梯度累積優化

In [None]:
class GradientAccumulationOptimizer:
    """Wrap an optimizer so that it steps once every N backward passes.

    Each call to :meth:`backward` scales the loss by ``1 / accumulation_steps``
    and accumulates gradients; :meth:`step` applies the update only after
    ``accumulation_steps`` backward passes were collected.

    Args:
        model: module whose parameters are clipped in :meth:`step`.
        optimizer: the wrapped optimizer.
        accumulation_steps: backward passes per parameter update.
        sync_every_n_steps: stored and printed only; falls back to
            ``accumulation_steps`` when falsy.
    """

    def __init__(self, model, optimizer, accumulation_steps=4, sync_every_n_steps=None):
        self.model = model
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        if sync_every_n_steps:
            self.sync_every_n_steps = sync_every_n_steps
        else:
            self.sync_every_n_steps = accumulation_steps

        # Counters: completed updates and backward passes since the last one.
        self.step_count = 0
        self.accumulated_gradients = 0

        # Timing samples, both in milliseconds.
        self.sync_times = []
        self.compute_times = []

        print(f"=== 梯度累積優化器 ===")
        print(f"累積步數: {self.accumulation_steps}")
        print(f"同步間隔: {self.sync_every_n_steps}")
        print(f"有效批次大小倍數: {self.accumulation_steps}x")

    def zero_grad(self):
        """Clear gradients, but only at the start of an accumulation window."""
        if self.accumulated_gradients != 0:
            return
        self.optimizer.zero_grad()

    def backward(self, loss):
        """Back-propagate ``loss / accumulation_steps`` and time the call.

        Returns:
            The scaled loss as a Python float.
        """
        began = time.time()

        loss_share = loss / self.accumulation_steps
        loss_share.backward()
        self.accumulated_gradients += 1

        elapsed = time.time() - began
        self.compute_times.append(elapsed * 1000)  # seconds -> milliseconds

        return loss_share.item()

    def step(self, clip_grad_norm=None):
        """Apply the optimizer update once enough gradients were accumulated.

        Args:
            clip_grad_norm: optional max norm passed to ``clip_grad_norm_``.

        Returns:
            True when parameters were updated, False otherwise.
        """
        if self.accumulated_gradients < self.accumulation_steps:
            return False

        began = time.time()

        if clip_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_grad_norm)

        self.optimizer.step()
        self.optimizer.zero_grad()

        elapsed = time.time() - began
        self.sync_times.append(elapsed * 1000)  # seconds -> milliseconds

        self.accumulated_gradients = 0
        self.step_count += 1
        return True

    def get_performance_stats(self):
        """Return mean timings, the update count and the compute/sync ratio."""
        mean_compute = np.mean(self.compute_times) if self.compute_times else 0
        mean_sync = np.mean(self.sync_times) if self.sync_times else 0

        # The ratio stays 0 when no update has been timed yet.
        ratio = mean_compute / mean_sync if mean_sync > 0 else 0

        return {
            'avg_compute_time_ms': mean_compute,
            'avg_sync_time_ms': mean_sync,
            'total_steps': self.step_count,
            'compute_sync_ratio': ratio
        }

def analyze_gradient_accumulation_benefits():
    """Print a what-if table for several gradient accumulation step counts.

    For each step count the table shows the effective batch size (base batch
    of 8), the share of synchronizations avoided, and a speed-up estimate
    that assumes 10% overhead per extra accumulation step. Returns None.
    """
    print("\n=== 梯度累積優勢分析 ===")

    base_batch_size = 8
    memory_efficiency = 1.0  # memory footprint does not change with accumulation

    for acc_steps in (1, 2, 4, 8, 16):
        effective_batch_size = base_batch_size * acc_steps
        communication_reduction = (acc_steps - 1) / acc_steps * 100

        report = [
            f"\n累積步數 {acc_steps}:",
            f"  有效批次大小: {effective_batch_size}",
            f"  通訊減少: {communication_reduction:.1f}%",
            f"  記憶體效率: {memory_efficiency:.1f}x",
            f"  同步頻率: 1/{acc_steps}",
        ]

        # A speed-up estimate is only meaningful when something is accumulated.
        if acc_steps > 1:
            speedup = acc_steps / (1 + 0.1 * (acc_steps - 1))
            report.append(f"  理論加速比: {speedup:.2f}x")

        print("\n".join(report))

    summary = (
        f"\n💡 關鍵優勢:",
        f"  1. 減少通訊頻率 → 降低網絡開銷",
        f"  2. 增大有效批次 → 改善訓練穩定性",
        f"  3. 保持記憶體使用 → 適合記憶體受限環境",
        f"  4. 更好的計算通訊比 → 提升整體效率",
    )
    for line in summary:
        print(line)

# Gradient accumulation demo: wrap a fresh AdamW optimizer so that parameters
# are updated once every 4 backward passes.
demo_optimizer = torch.optim.AdamW(demo_model.parameters(), lr=1e-3)
grad_acc_optimizer = GradientAccumulationOptimizer(
    demo_model, demo_optimizer, accumulation_steps=4
)

# One fixed random batch is reused for every simulated step.
print("\n=== 梯度累積演示 ===")
demo_input = torch.randn(8, 1024).to(device)
demo_target = torch.randint(0, 10, (8,)).to(device)
criterion = nn.CrossEntropyLoss()

for step in range(8):
    grad_acc_optimizer.zero_grad()

    output = demo_model(demo_input)
    loss = criterion(output, demo_target)

    # backward() returns the loss already divided by the accumulation steps;
    # step() reports whether this iteration actually updated the parameters.
    scaled_loss = grad_acc_optimizer.backward(loss)
    step_taken = grad_acc_optimizer.step(clip_grad_norm=1.0)

    print(f"步驟 {step+1}: 損失 {scaled_loss:.4f}, 參數更新: {step_taken}")

# Timing statistics; only the time-valued entries carry a unit suffix.
perf_stats = grad_acc_optimizer.get_performance_stats()
print(f"\n性能統計:")
for key, value in perf_stats.items():
    print(f"  {key}: {value:.2f}" + (" ms" if 'time' in key else ""))

# Print the what-if table for different accumulation step counts.
analyze_gradient_accumulation_benefits()

## 4. 混合精度訓練優化

In [None]:
class MixedPrecisionDDPTrainer:
    """Training-step helper combining AMP, gradient scaling and accumulation.

    Args:
        model: module to train.
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: optional LR scheduler, stepped after every parameter update.
        use_amp: request automatic mixed precision; it is silently disabled
            when CUDA is not available.
        gradient_accumulation_steps: micro-batches per parameter update;
            must be >= 1.

    Raises:
        ValueError: if ``gradient_accumulation_steps`` is smaller than 1.
    """

    def __init__(self, model, optimizer, scheduler=None,
                 use_amp=True, gradient_accumulation_steps=1):
        if gradient_accumulation_steps < 1:
            raise ValueError(
                f"gradient_accumulation_steps must be >= 1, got {gradient_accumulation_steps}"
            )

        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.use_amp = use_amp and torch.cuda.is_available()
        self.gradient_accumulation_steps = gradient_accumulation_steps

        # The gradient scaler only exists when AMP is really active.
        if self.use_amp:
            self.scaler = GradScaler()
            print("✅ 啟用自動混合精度 (AMP)")
        else:
            self.scaler = None
            print("⚠️ 未啟用混合精度")

        # Per-update samples: times in ms, memory in MB.
        self.performance_metrics = {
            'forward_time': [],
            'backward_time': [],
            'optimizer_time': [],
            'memory_usage': [],
            'loss_scale': []
        }

        self.step_count = 0  # number of micro-batches processed so far
        self.accumulated_loss = 0.0

    def train_step(self, batch_data, criterion, clip_grad_norm=None):
        """Run forward/backward for one micro-batch and update when due.

        Args:
            batch_data: mapping with the keys ``'input'`` and ``'target'``.
            criterion: loss callable ``criterion(outputs, target)``.
            clip_grad_norm: optional max gradient norm applied before an update.

        Returns:
            ``(loss, updated)``. On an update step ``loss`` is the sum of the
            scaled micro-batch losses of the window (i.e. their mean) and
            ``updated`` is True; otherwise ``loss`` is this micro-batch's
            scaled loss and ``updated`` is False.
        """
        start_time = time.time()

        # Forward pass; the loss is pre-divided so accumulated gradients
        # average over the window instead of summing.
        if self.use_amp:
            with autocast():
                outputs = self.model(batch_data['input'])
                loss = criterion(outputs, batch_data['target'])
                loss = loss / self.gradient_accumulation_steps
        else:
            outputs = self.model(batch_data['input'])
            loss = criterion(outputs, batch_data['target'])
            loss = loss / self.gradient_accumulation_steps

        forward_time = time.time() - start_time

        # Backward pass (scaled when AMP is active).
        start_time = time.time()
        if self.use_amp:
            self.scaler.scale(loss).backward()
        else:
            loss.backward()

        backward_time = time.time() - start_time
        self.accumulated_loss += loss.item()

        # Count every micro-batch. The counter must also advance on update
        # steps, otherwise the modulo test below stays true forever after the
        # first window and every later call performs an update.
        self.step_count += 1
        if self.step_count % self.gradient_accumulation_steps != 0:
            return loss.item(), False

        start_time = time.time()

        if self.use_amp:
            # Gradients must be unscaled before their norm can be clipped.
            if clip_grad_norm is not None:
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_grad_norm)

            self.scaler.step(self.optimizer)
            self.scaler.update()

            self.performance_metrics['loss_scale'].append(self.scaler.get_scale())
        else:
            if clip_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_grad_norm)

            self.optimizer.step()

        self.optimizer.zero_grad()

        if self.scheduler is not None:
            self.scheduler.step()

        optimizer_time = time.time() - start_time

        # Metrics are sampled once per parameter update.
        self.performance_metrics['forward_time'].append(forward_time * 1000)
        self.performance_metrics['backward_time'].append(backward_time * 1000)
        self.performance_metrics['optimizer_time'].append(optimizer_time * 1000)

        if torch.cuda.is_available():
            memory_usage = torch.cuda.memory_allocated() / (1024**2)  # MB
            self.performance_metrics['memory_usage'].append(memory_usage)

        avg_loss = self.accumulated_loss
        self.accumulated_loss = 0.0

        return avg_loss, True

    def get_performance_summary(self):
        """Return avg/std/max/min for every metric that has samples.

        Returns:
            dict with keys ``'<metric>_avg'``, ``'<metric>_std'``,
            ``'<metric>_max'`` and ``'<metric>_min'``.
        """
        summary = {}

        for metric, values in self.performance_metrics.items():
            if values:
                summary[f'{metric}_avg'] = np.mean(values)
                summary[f'{metric}_std'] = np.std(values)
                summary[f'{metric}_max'] = np.max(values)
                summary[f'{metric}_min'] = np.min(values)

        return summary

def compare_precision_modes():
    """Print a side-by-side summary of FP32, FP16, BF16 and mixed precision.

    The memory and speed factors are illustrative constants hard-coded here,
    relative to FP32 = 1.0. Returns None.
    """
    print("\n=== 精度模式比較 ===")

    columns = ('memory_factor', 'speed_factor', 'accuracy',
               'numerical_range', 'mantissa_bits')
    rows = (
        ('FP32 (單精度)', (1.0, 1.0, 'highest', '±3.4e38', 23)),
        # speed factor reflects Tensor Core acceleration
        ('FP16 (半精度)', (0.5, 1.5, 'good', '±6.5e4', 10)),
        ('BF16 (Brain Float16)', (0.5, 1.4, 'better', '±3.4e38', 7)),
        # compute mostly in FP16 while the weights stay in FP32
        ('Mixed Precision', (0.6, 1.6, 'high', 'adaptive', 'adaptive')),
    )
    precision_modes = {label: dict(zip(columns, values)) for label, values in rows}

    for label, spec in precision_modes.items():
        print(
            f"\n{label}:",
            f"  記憶體使用: {spec['memory_factor']:.1f}x",
            f"  計算速度: {spec['speed_factor']:.1f}x",
            f"  數值精度: {spec['accuracy']}",
            f"  數值範圍: {spec['numerical_range']}",
            f"  尾數位數: {spec['mantissa_bits']}",
            sep="\n",
        )

    advice = (
        f"\n🎯 選擇建議:",
        f"  - 最高精度需求: FP32",
        f"  - 平衡性能精度: Mixed Precision",
        f"  - 極致性能: FP16 (需要仔細調優)",
        f"  - 大模型訓練: BF16 (更穩定的FP16替代)",
    )
    for line in advice:
        print(line)

# Mixed precision demo: train the demo model for a few steps with AMP
# requested and an accumulation window of 2 micro-batches.
print("=== 混合精度訓練演示 ===")

demo_optimizer = torch.optim.AdamW(demo_model.parameters(), lr=1e-3)
mixed_precision_trainer = MixedPrecisionDDPTrainer(
    demo_model, demo_optimizer, use_amp=True, gradient_accumulation_steps=2
)

# Every step draws a fresh random batch.
criterion = nn.CrossEntropyLoss()
for step in range(6):
    batch_data = dict(
        input=torch.randn(8, 1024).to(device),
        target=torch.randint(0, 10, (8,)).to(device),
    )

    loss, param_updated = mixed_precision_trainer.train_step(
        batch_data, criterion, clip_grad_norm=1.0
    )

    print(f"步驟 {step+1}: 損失 {loss:.4f}, 參數更新: {param_updated}")

# Summary statistics; the unit/format is chosen from the metric name and
# entries matching none of the three categories are not printed.
perf_summary = mixed_precision_trainer.get_performance_summary()
print(f"\n=== 性能摘要 ===")
for key, value in perf_summary.items():
    if 'time' in key:
        print(f"{key}: {value:.2f} ms")
    elif 'memory' in key:
        print(f"{key}: {value:.1f} MB")
    elif 'scale' in key:
        print(f"{key}: {value:.0f}")

# Print the precision mode comparison table.
compare_precision_modes()

## 5. 通訊與計算重疊

In [None]:
class ComputationCommunicationOverlap:
    """Analytical model of overlapping backward computation with gradient
    communication.

    The per-module parameter counts are collected once at construction; all
    timings printed by this class come from fixed assumptions, not from
    measurements.

    Args:
        model: module to analyze.
    """

    def __init__(self, model):
        self.model = model
        self.layer_info = self._analyze_model_layers()

    def _analyze_model_layers(self):
        """Collect parameter counts for every module that owns parameters.

        Only parameters registered directly on a module are counted for it
        (``recurse=False``), so modules that hold parameters *and* have
        children are included without counting anything twice.

        Returns:
            list of dicts with ``name``, ``type``, ``parameters`` and
            ``gradient_size_mb`` (FP32).
        """
        layer_info = []

        for name, module in self.model.named_modules():
            param_count = sum(p.numel() for p in module.parameters(recurse=False))
            if param_count > 0:
                layer_info.append({
                    'name': name,
                    'type': type(module).__name__,
                    'parameters': param_count,
                    'gradient_size_mb': param_count * 4 / (1024 * 1024)  # FP32
                })

        return layer_info

    def analyze_overlap_potential(self):
        """Print the per-layer gradient distribution and return the layer list."""
        print("\n=== 計算通訊重疊分析 ===")

        total_params = sum(layer['parameters'] for layer in self.layer_info)
        total_gradient_mb = sum(layer['gradient_size_mb'] for layer in self.layer_info)

        print(f"模型總參數: {total_params:,}")
        print(f"總梯度大小: {total_gradient_mb:.2f} MB")
        print()

        print("層級梯度分布:")
        for layer in self.layer_info:
            # Every listed layer has parameters, so total_params > 0 here.
            percentage = layer['parameters'] / total_params * 100
            print(f"  {layer['name']:<20} ({layer['type']:<15}): "
                  f"{layer['parameters']:>8,} params ({percentage:5.1f}%), "
                  f"{layer['gradient_size_mb']:6.2f} MB")

        return self.layer_info

    def simulate_overlap_strategies(self):
        """Print total step time and speed-up for several overlap ratios.

        Assumes 10 ms of compute per layer and a 10 GB/s link.

        Returns:
            dict mapping strategy name to ``description`` and ``overlap_ratio``.
        """
        print(f"\n=== 重疊策略模擬 ===")

        # Assumed compute and communication characteristics.
        compute_time_per_layer = 10  # ms
        communication_bandwidth = 10  # GB/s

        strategies = {
            '無重疊': {
                'description': '串行執行計算和通訊',
                'overlap_ratio': 0.0
            },
            '層級重疊': {
                'description': '按層重疊梯度通訊',
                'overlap_ratio': 0.7
            },
            '參數組重疊': {
                'description': '按參數組重疊通訊',
                'overlap_ratio': 0.8
            },
            '完全異步': {
                'description': '完全異步計算通訊',
                'overlap_ratio': 0.95
            }
        }

        total_compute_time = len(self.layer_info) * compute_time_per_layer
        total_gradient_mb = sum(layer['gradient_size_mb'] for layer in self.layer_info)
        # MB -> GB (/1024), GB / (GB/s) = seconds, seconds -> ms (*1000).
        total_comm_time = total_gradient_mb / 1024 / communication_bandwidth * 1000

        print(f"假設條件:")
        print(f"  每層計算時間: {compute_time_per_layer} ms")
        print(f"  通訊帶寬: {communication_bandwidth} GB/s")
        print(f"  總計算時間: {total_compute_time} ms")
        print(f"  總通訊時間: {total_comm_time:.1f} ms")
        print()

        # Fully serial execution is the baseline for every strategy.
        baseline_time = total_compute_time + total_comm_time

        for strategy, props in strategies.items():
            # Only the non-overlapped share of communication adds to the step.
            overlapped_comm_time = total_comm_time * (1 - props['overlap_ratio'])
            total_time = total_compute_time + overlapped_comm_time

            # A parameter-free model has zero time; report a neutral 1.0x.
            speedup = baseline_time / total_time if total_time > 0 else 1.0

            efficiency = props['overlap_ratio'] * 100

            print(f"{strategy}:")
            print(f"  描述: {props['description']}")
            print(f"  重疊效率: {efficiency:.1f}%")
            print(f"  總時間: {total_time:.1f} ms")
            print(f"  加速比: {speedup:.2f}x")
            print()

        return strategies

    def recommend_optimization_strategy(self):
        """Print and return recommendations based on model size and on how
        evenly the parameters are spread across layers.

        Returns:
            list of recommendation strings, in print order.
        """
        print(f"\n=== 優化策略推薦 ===")

        total_params = sum(layer['parameters'] for layer in self.layer_info)

        recommendations = []

        # Recommendations driven by total model size.
        if total_params < 1e6:
            recommendations.append("🔹 小模型 (<1M參數): 通訊開銷相對較小，重點優化計算")
            recommendations.append("  - 使用更大的批次大小")
            recommendations.append("  - 簡單的梯度累積即可")
        elif total_params < 1e8:
            recommendations.append("🔹 中等模型 (1M-100M參數): 平衡計算和通訊優化")
            recommendations.append("  - 啟用混合精度訓練")
            recommendations.append("  - 使用層級梯度重疊")
            recommendations.append("  - 適度的梯度累積 (4-8步)")
        else:
            recommendations.append("🔹 大模型 (>100M參數): 重點優化通訊")
            recommendations.append("  - 啟用所有通訊優化")
            recommendations.append("  - 使用梯度壓縮")
            recommendations.append("  - 大步數梯度累積 (16+步)")
            recommendations.append("  - 考慮ZeRO優化器")

        # Recommendations driven by the largest/smallest layer ratio; skipped
        # when the model has no parameterized layer at all.
        if self.layer_info:
            max_layer_params = max(layer['parameters'] for layer in self.layer_info)
            min_layer_params = min(layer['parameters'] for layer in self.layer_info)
            param_variance = max_layer_params / min_layer_params if min_layer_params > 0 else float('inf')

            if param_variance > 10:
                recommendations.append("\n🔹 層級參數分布不均: 使用分層通訊策略")
                recommendations.append("  - 大層單獨通訊")
                recommendations.append("  - 小層聚合通訊")
            else:
                recommendations.append("\n🔹 層級參數分布均勻: 使用統一通訊策略")
                recommendations.append("  - 均勻的重疊策略")
                recommendations.append("  - 標準的All-Reduce")

        for rec in recommendations:
            print(rec)

        return recommendations

# Analyze the demo model's potential for overlapping compute and communication.
overlap_analyzer = ComputationCommunicationOverlap(model=demo_model)

# Run the three analyses in order: per-layer gradient distribution, overlap
# strategy simulation, and the resulting recommendations.
layer_analysis = overlap_analyzer.analyze_overlap_potential()
overlap_strategies = overlap_analyzer.simulate_overlap_strategies()
optimization_recommendations = overlap_analyzer.recommend_optimization_strategy()

# Summary of the overlap techniques that are specific to DDP.
print(f"\n=== DDP重疊技術詳解 ===")
print(f"\n1. 梯度重疊 (Gradient Overlapping):")
print(f"   - 計算後向梯度的同時進行All-Reduce")
# This line was missing its print( call, which made the whole cell a SyntaxError.
print(f"   - 使用DDP的bucket機制分批通訊")
# DDP exposes the bucket size as `bucket_cap_mb`; communication hooks are
# installed with `register_comm_hook()`.
print(f"   - 參數: bucket_cap_mb, register_comm_hook()")
print(f"\n2. 參數重疊 (Parameter Overlapping):")
print(f"   - 在計算梯度時重疊參數廣播")
print(f"   - 適用於ZeRO-3等參數分片方案")
print(f"   - 減少參數載入的等待時間")
print(f"\n3. 流水線重疊 (Pipeline Overlapping):")
print(f"   - 不同層的計算和通訊流水線執行")
print(f"   - 需要仔細的依賴關係管理")
print(f"   - 最大化硬體利用率")
print(f"\n4. 異步通訊 (Asynchronous Communication):")
print(f"   - 使用CUDA流實現真正的異步")
print(f"   - 需要同步點來保證正確性")
print(f"   - 平衡延遲和一致性")

## 6. 性能監控與調優

In [None]:
class DDPPerformanceMonitor:
    """Background resource sampler plus training-step recorder.

    A daemon thread periodically samples GPU memory/utilization, CPU usage
    and cumulative network I/O; the training loop reports step time, loss and
    learning rate via :meth:`record_training_step`. All samples are kept in
    ``monitoring_data``.

    Args:
        model: module used for parameter-count based estimates.
        world_size: number of ranks; communication and scaling estimates are
            only printed when it is greater than 1.
    """

    def __init__(self, model, world_size=1):
        self.model = model
        self.world_size = world_size
        self.monitoring_data = {
            'timestamps': [],
            'gpu_memory': [],
            'gpu_utilization': [],
            'cpu_usage': [],
            'network_io': [],
            'step_times': [],
            'loss_values': [],
            'learning_rates': []
        }

        self.is_monitoring = False
        self.monitor_thread = None
        # Lets stop_monitoring() wake the sampler instead of waiting out a sleep.
        self._stop_event = threading.Event()

    def start_monitoring(self, interval=1.0):
        """Start the sampling thread.

        Args:
            interval: seconds between two samples.
        """
        if self.is_monitoring:
            print("監控已在運行中")
            return

        self._stop_event.clear()
        self.is_monitoring = True
        self.monitor_thread = threading.Thread(
            target=self._monitoring_loop,
            args=(interval,),
            daemon=True
        )
        self.monitor_thread.start()
        print(f"✅ 開始性能監控 (間隔: {interval}s)")

    def stop_monitoring(self):
        """Stop the sampling thread and wait for it to finish."""
        if not self.is_monitoring:
            print("監控未運行")
            return

        self.is_monitoring = False
        self._stop_event.set()
        if self.monitor_thread:
            self.monitor_thread.join()
        print("⏹️ 性能監控已停止")

    def _read_gpu_utilization(self):
        """Return the GPU utilization, or 0 when it cannot be queried."""
        if not hasattr(torch.cuda, 'utilization'):
            return 0
        try:
            return torch.cuda.utilization()
        except (ImportError, RuntimeError):
            # Best effort: a failing utilization query (for example a missing
            # NVML binding) must not stop the CPU / network sampling.
            return 0

    def _monitoring_loop(self, interval):
        """Sample resources every ``interval`` seconds until stopped."""
        while self.is_monitoring:
            try:
                timestamp = time.time()

                # GPU sample
                if torch.cuda.is_available():
                    gpu_memory = torch.cuda.memory_allocated() / (1024**2)  # MB
                    gpu_utilization = self._read_gpu_utilization()
                else:
                    gpu_memory = 0
                    gpu_utilization = 0

                # CPU sample
                cpu_usage = psutil.cpu_percent()

                # Network I/O (simplified): cumulative bytes sent + received
                net_io = psutil.net_io_counters()
                network_io = net_io.bytes_sent + net_io.bytes_recv

                self.monitoring_data['timestamps'].append(timestamp)
                self.monitoring_data['gpu_memory'].append(gpu_memory)
                self.monitoring_data['gpu_utilization'].append(gpu_utilization)
                self.monitoring_data['cpu_usage'].append(cpu_usage)
                self.monitoring_data['network_io'].append(network_io)

            except Exception as e:
                print(f"監控錯誤: {e}")
                # Clear the flag so monitoring can be started again later.
                self.is_monitoring = False
                break

            # Returns early when stop_monitoring() sets the event.
            self._stop_event.wait(interval)

    def record_training_step(self, step_time, loss, learning_rate):
        """Append one training step's time (ms), loss and learning rate."""
        self.monitoring_data['step_times'].append(step_time)
        self.monitoring_data['loss_values'].append(loss)
        self.monitoring_data['learning_rates'].append(learning_rate)

    def generate_performance_report(self):
        """Print a summary of all collected data.

        Returns:
            The ``monitoring_data`` dict itself (not a copy).
        """
        print("\n=== DDP 性能報告 ===")

        # Training throughput
        if self.monitoring_data['step_times']:
            avg_step_time = np.mean(self.monitoring_data['step_times'])
            throughput = 1000 / avg_step_time  # steps per second
            print(f"\n訓練性能:")
            print(f"  平均步驟時間: {avg_step_time:.2f} ms")
            print(f"  訓練吞吐量: {throughput:.2f} steps/sec")
            print(f"  總訓練步數: {len(self.monitoring_data['step_times'])}")

        # Resource usage
        if self.monitoring_data['gpu_memory']:
            avg_gpu_mem = np.mean(self.monitoring_data['gpu_memory'])
            max_gpu_mem = np.max(self.monitoring_data['gpu_memory'])
            print(f"\nGPU資源:")
            print(f"  平均GPU記憶體: {avg_gpu_mem:.1f} MB")
            print(f"  峰值GPU記憶體: {max_gpu_mem:.1f} MB")

        if self.monitoring_data['cpu_usage']:
            avg_cpu = np.mean(self.monitoring_data['cpu_usage'])
            max_cpu = np.max(self.monitoring_data['cpu_usage'])
            print(f"\nCPU資源:")
            print(f"  平均CPU使用率: {avg_cpu:.1f}%")
            print(f"  峰值CPU使用率: {max_cpu:.1f}%")

        # Network statistics need at least two samples to form a delta.
        if len(self.monitoring_data['network_io']) > 1:
            total_io = self.monitoring_data['network_io'][-1] - self.monitoring_data['network_io'][0]
            duration = self.monitoring_data['timestamps'][-1] - self.monitoring_data['timestamps'][0]
            # Two samples can share a timestamp on a coarse clock.
            avg_bandwidth = total_io / duration / (1024**2) if duration > 0 else 0.0  # MB/s
            print(f"\n網絡IO:")
            print(f"  總數據傳輸: {total_io / (1024**2):.1f} MB")
            print(f"  平均帶寬: {avg_bandwidth:.2f} MB/s")

        self._analyze_efficiency()

        return self.monitoring_data

    def _analyze_efficiency(self):
        """Print rough compute, communication and scaling estimates.

        The communication overhead (10% of a step) and the single-GPU
        baseline (90% of ideal) are fixed assumptions, so the reported
        scaling efficiency illustrates the formula rather than measuring it.
        """
        print(f"\n=== 效率分析 ===")

        model_params = sum(p.numel() for p in self.model.parameters())
        model_flops = model_params * 2  # simplified estimate

        has_steps = bool(self.monitoring_data['step_times'])
        if has_steps:
            avg_step_time_s = np.mean(self.monitoring_data['step_times']) / 1000
            actual_flops_per_sec = model_flops / avg_step_time_s

            print(f"計算效率:")
            print(f"  模型參數量: {model_params:,}")
            print(f"  估算FLOPS: {model_flops:,}")
            print(f"  實際FLOPS/秒: {actual_flops_per_sec:.2e}")

        # Communication estimates are only meaningful with several ranks.
        if self.world_size > 1:
            gradient_size_mb = model_params * 4 / (1024**2)  # FP32
            print(f"\n通訊效率:")
            print(f"  梯度大小: {gradient_size_mb:.2f} MB")
            # MB * 8 = megabits; megabits / 10 Gbps = milliseconds. The label
            # states Gbps because the arithmetic works in bits.
            print(f"  理論通訊時間 (10 Gbps): {gradient_size_mb * 8 / 10:.2f} ms")
            if has_steps:
                comm_overhead = avg_step_time_s * 1000 * 0.1  # assume 10% is communication
                print(f"  估算通訊開銷: {comm_overhead:.2f} ms")

        # Scaling efficiency
        if self.world_size > 1:
            ideal_speedup = self.world_size
            if has_steps:
                # Assumed single-GPU baseline: 90% of ideal scaling.
                single_gpu_time = avg_step_time_s * self.world_size * 0.9
                actual_speedup = single_gpu_time / avg_step_time_s
                efficiency = actual_speedup / ideal_speedup * 100

                print(f"\n擴展效率:")
                print(f"  理想加速比: {ideal_speedup:.1f}x")
                print(f"  實際加速比: {actual_speedup:.1f}x")
                print(f"  擴展效率: {efficiency:.1f}%")

    def plot_performance_curves(self):
        """Plot GPU memory, CPU usage, step time and loss in a 2x2 grid and
        save the figure to ``ddp_performance_curves.png``."""
        if not self.monitoring_data['timestamps']:
            print("無監控數據可繪製")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # GPU memory
        if self.monitoring_data['gpu_memory']:
            axes[0, 0].plot(self.monitoring_data['gpu_memory'])
            axes[0, 0].set_title('GPU 記憶體使用')
            axes[0, 0].set_ylabel('記憶體 (MB)')
            axes[0, 0].grid(True, alpha=0.3)

        # CPU usage
        if self.monitoring_data['cpu_usage']:
            axes[0, 1].plot(self.monitoring_data['cpu_usage'])
            axes[0, 1].set_title('CPU 使用率')
            axes[0, 1].set_ylabel('使用率 (%)')
            axes[0, 1].grid(True, alpha=0.3)

        # Training step time
        if self.monitoring_data['step_times']:
            axes[1, 0].plot(self.monitoring_data['step_times'])
            axes[1, 0].set_title('訓練步驟時間')
            axes[1, 0].set_ylabel('時間 (ms)')
            axes[1, 0].set_xlabel('步驟')
            axes[1, 0].grid(True, alpha=0.3)

        # Loss curve
        if self.monitoring_data['loss_values']:
            axes[1, 1].plot(self.monitoring_data['loss_values'])
            axes[1, 1].set_title('訓練損失')
            axes[1, 1].set_ylabel('損失')
            axes[1, 1].set_xlabel('步驟')
            axes[1, 1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('ddp_performance_curves.png', dpi=150, bbox_inches='tight')
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

        print("📊 性能曲線已保存到 ddp_performance_curves.png")

# Performance monitoring demo: sample resources in the background while a
# short training loop runs, then report and plot what was collected.
print("=== DDP 性能監控演示 ===")

# The monitor is configured as if 4 ranks were used, and samples every 0.5 s.
monitor = DDPPerformanceMonitor(model=demo_model, world_size=4)
monitor.start_monitoring(interval=0.5)

print("\n模擬訓練過程...")
optimizer = torch.optim.AdamW(demo_model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for step in range(10):
    start_time = time.time()

    # Fresh random batch for every step.
    inputs = torch.randn(8, 1024).to(device)
    targets = torch.randint(0, 10, (8,)).to(device)

    # Plain forward / backward / update.
    optimizer.zero_grad()
    outputs = demo_model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # Wall-clock duration of the whole step, in milliseconds.
    step_time = (time.time() - start_time) * 1000

    monitor.record_training_step(
        step_time=step_time, loss=loss.item(), learning_rate=1e-3
    )

    print(f"步驟 {step+1}: 損失 {loss.item():.4f}, 時間 {step_time:.2f}ms")

    # Pause between steps to imitate a real training cadence.
    time.sleep(0.1)

# Give the sampler time for a few more readings before stopping it.
time.sleep(1)
monitor.stop_monitoring()

# Text report first, then the plots.
performance_data = monitor.generate_performance_report()
monitor.plot_performance_curves()

## 7. 優化配置生成器

In [None]:
class DDPOptimizationConfigGenerator:
    """Generate a DDP training configuration from hardware/model heuristics.

    The generator combines a hardware preset (GPU count, memory per GPU,
    interconnect bandwidth) with a rough FP32 memory estimate of the model
    to produce training, optimization and monitoring settings as a plain,
    JSON-serializable dict.
    """

    def __init__(self):
        # Hardware presets keyed by profile name. 'memory_per_gpu' is in GB
        # and 'bandwidth' in GB/s (0 for the single-GPU preset).
        self.hardware_profiles = {
            'single_gpu': {
                'gpu_count': 1,
                'memory_per_gpu': 16,
                'bandwidth': 0,
                'description': '單GPU環境'
            },
            'dual_gpu': {
                'gpu_count': 2,
                'memory_per_gpu': 16,
                'bandwidth': 50,  # GB/s
                'description': '雙GPU工作站'
            },
            'quad_gpu': {
                'gpu_count': 4,
                'memory_per_gpu': 24,
                'bandwidth': 100,
                'description': '四GPU高端工作站'
            },
            'dgx_node': {
                'gpu_count': 8,
                'memory_per_gpu': 80,
                'bandwidth': 600,  # NVLink
                'description': 'DGX節點'
            },
            'multi_node': {
                'gpu_count': 32,
                'memory_per_gpu': 80,
                'bandwidth': 200,  # InfiniBand
                'description': '多節點集群'
            }
        }

    def analyze_model_requirements(self, model):
        """Estimate the FP32 memory footprint of ``model``.

        Args:
            model: Object exposing ``parameters()`` that yields tensors with
                ``numel()`` and ``requires_grad``.

        Returns:
            dict with ``total_params`` (all parameters), ``model_memory_gb``
            (weights), ``gradient_size_gb`` (gradients of trainable
            parameters only) and ``total_memory_gb`` (weights + gradients +
            optimizer state + a fixed activation estimate).
        """
        total_params = 0
        trainable_params = 0
        for param in model.parameters():
            count = param.numel()
            total_params += count
            if param.requires_grad:
                trainable_params += count

        bytes_per_param = 4  # FP32
        bytes_per_gb = 1024 ** 3

        model_memory = total_params * bytes_per_param / bytes_per_gb
        # Frozen parameters have no gradient and no optimizer state, so only
        # trainable parameters contribute to these two terms.
        gradient_memory = trainable_params * bytes_per_param / bytes_per_gb
        optimizer_memory = gradient_memory * 2  # Adam state
        activation_memory = 2  # fixed rough estimate, in GB

        total_memory = model_memory + gradient_memory + optimizer_memory + activation_memory

        return {
            'total_params': total_params,
            'model_memory_gb': model_memory,
            'total_memory_gb': total_memory,
            'gradient_size_gb': gradient_memory
        }

    def generate_config(self, model, hardware_profile='auto', target_batch_size=32):
        """Build the full configuration dict for ``model``.

        Args:
            model: Model to analyze (see ``analyze_model_requirements``).
            hardware_profile: Key of ``self.hardware_profiles``, or ``'auto'``
                to pick one from the locally visible GPU count.
            target_batch_size: Desired global batch size per optimizer step.

        Returns:
            dict with the sections ``hardware``, ``model``, ``training``,
            ``optimization`` and ``monitoring``.

        Raises:
            KeyError: If ``hardware_profile`` is not a known profile name.
            ValueError: If ``target_batch_size`` is smaller than 1.
        """
        model_reqs = self.analyze_model_requirements(model)

        if hardware_profile == 'auto':
            hardware_profile = self._select_hardware_profile(model_reqs)

        hw_profile = self.hardware_profiles[hardware_profile]

        config = {
            'hardware': {
                'profile': hardware_profile,
                'gpu_count': hw_profile['gpu_count'],
                'memory_per_gpu': hw_profile['memory_per_gpu'],
                'bandwidth_gbps': hw_profile['bandwidth']
            },
            'model': model_reqs,
            'training': self._generate_training_config(model_reqs, hw_profile, target_batch_size),
            'optimization': self._generate_optimization_config(model_reqs, hw_profile),
            'monitoring': self._generate_monitoring_config(hw_profile)
        }

        return config

    def _select_hardware_profile(self, model_reqs):
        """Pick a profile name from the number of locally visible GPUs.

        ``model_reqs`` is accepted for interface symmetry but is not
        consulted. Counts between presets map to the next larger preset
        (e.g. 3 GPUs -> ``'quad_gpu'``).
        """
        gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 1

        if gpu_count == 1:
            return 'single_gpu'
        elif gpu_count == 2:
            return 'dual_gpu'
        elif gpu_count <= 4:
            return 'quad_gpu'
        elif gpu_count <= 8:
            return 'dgx_node'
        else:
            return 'multi_node'

    def _generate_training_config(self, model_reqs, hw_profile, target_batch_size):
        """Derive per-GPU batch size and gradient-accumulation steps.

        The per-GPU batch is capped by a memory heuristic (80% of GPU memory
        divided by the model's estimated total memory). Accumulation steps
        and per-GPU batch are both rounded up, so the effective batch size is
        never below ``target_batch_size`` and the per-GPU batch is never 0.

        Args:
            model_reqs: dict with ``total_memory_gb`` and ``total_params``.
            hw_profile: dict with ``gpu_count`` and ``memory_per_gpu``.
            target_batch_size: Desired global batch size (>= 1).

        Returns:
            dict of training settings.

        Raises:
            ValueError: If ``target_batch_size`` is smaller than 1.
        """
        if target_batch_size < 1:
            raise ValueError(
                f"target_batch_size must be >= 1, got {target_batch_size}"
            )

        gpu_count = hw_profile['gpu_count']
        memory_per_gpu = hw_profile['memory_per_gpu']

        # Largest feasible per-GPU batch under the memory heuristic
        available_memory = memory_per_gpu * 0.8  # keep a 20% safety margin
        max_batch_per_gpu = max(1, int(available_memory / model_reqs['total_memory_gb']))

        # Samples all GPUs can process in one forward/backward pass
        total_batch_capacity = max_batch_per_gpu * gpu_count

        # Ceiling division: enough accumulation steps to reach the target.
        gradient_accumulation = -(-target_batch_size // total_batch_capacity)
        # Spread the target over GPUs and steps, rounding up. This never
        # exceeds max_batch_per_gpu because of how the steps were chosen.
        micro_batch_size = -(-target_batch_size // (gpu_count * gradient_accumulation))

        # Recommend mixed precision for models above 1M parameters
        use_amp = model_reqs['total_params'] > 1e6

        return {
            'batch_size_per_gpu': micro_batch_size,
            'gradient_accumulation_steps': gradient_accumulation,
            'effective_batch_size': micro_batch_size * gpu_count * gradient_accumulation,
            'use_mixed_precision': use_amp,
            'gradient_clipping': 1.0,
            'dataloader_num_workers': min(4, max(1, gpu_count))
        }

    def _generate_optimization_config(self, model_reqs, hw_profile):
        """Derive communication/DDP settings.

        Args:
            model_reqs: dict with ``gradient_size_gb`` and ``total_params``.
            hw_profile: dict with ``gpu_count``.

        Returns:
            dict of DDP settings; ``gradient_compression`` and
            ``communication_overlap`` sections are present only when the
            corresponding heuristic enables them.
        """
        gpu_count = hw_profile['gpu_count']

        # A communication backend is only needed with more than one GPU
        backend = 'nccl' if gpu_count > 1 else None

        # Start from 25 MB and shrink the bucket when all gradients together
        # are smaller than that (never below 1 MB).
        bucket_size_mb = 25
        gradient_size_mb = model_reqs['gradient_size_gb'] * 1024
        if gradient_size_mb < bucket_size_mb:
            bucket_size_mb = max(1, int(gradient_size_mb))

        # Communication heuristics
        use_gradient_compression = model_reqs['total_params'] > 1e8 and gpu_count > 4
        overlap_communication = gpu_count > 2

        config = {
            'backend': backend,
            'bucket_size_mb': bucket_size_mb,
            'find_unused_parameters': False,
            'broadcast_buffers': True,
            'gradient_as_bucket_view': True
        }

        if use_gradient_compression:
            config['gradient_compression'] = {
                'method': 'fp16',
                'ratio': 2.0
            }

        if overlap_communication:
            config['communication_overlap'] = {
                'enabled': True,
                'overlap_ratio': 0.8
            }

        return config

    def _generate_monitoring_config(self, hw_profile):
        """Return logging/monitoring intervals.

        Communication monitoring and profiling are enabled only when the
        profile has more than one GPU.
        """
        multi_gpu = hw_profile['gpu_count'] > 1
        return {
            'log_interval': 50,
            'save_interval': 1000,
            'eval_interval': 500,
            'monitor_gpu_memory': True,
            'monitor_communication': multi_gpu,
            'profile_steps': 100 if multi_gpu else 0
        }

    def print_config(self, config):
        """Pretty-print a configuration produced by ``generate_config``."""
        print("\n=== DDP 優化配置 ===")

        print("\n🖥️ 硬體配置:")
        hw = config['hardware']
        print(f"  配置類型: {hw['profile']}")
        print(f"  GPU數量: {hw['gpu_count']}")
        print(f"  每GPU記憶體: {hw['memory_per_gpu']} GB")
        print(f"  網絡帶寬: {hw['bandwidth_gbps']} GB/s")

        print("\n🤖 模型分析:")
        model_info = config['model']
        print(f"  參數量: {model_info['total_params']:,}")
        print(f"  模型記憶體: {model_info['model_memory_gb']:.2f} GB")
        print(f"  總記憶體需求: {model_info['total_memory_gb']:.2f} GB")

        print("\n🚀 訓練配置:")
        train = config['training']
        print(f"  每GPU批次大小: {train['batch_size_per_gpu']}")
        print(f"  梯度累積步數: {train['gradient_accumulation_steps']}")
        print(f"  有效批次大小: {train['effective_batch_size']}")
        print(f"  混合精度: {'✅' if train['use_mixed_precision'] else '❌'}")
        print(f"  梯度裁剪: {train['gradient_clipping']}")

        print("\n⚡ 優化配置:")
        opt = config['optimization']
        print(f"  通訊後端: {opt['backend'] or 'N/A'}")
        print(f"  DDP桶大小: {opt['bucket_size_mb']} MB")
        print(f"  查找未使用參數: {'✅' if opt['find_unused_parameters'] else '❌'}")

        # Optional sections exist only when the heuristics enabled them
        if 'gradient_compression' in opt:
            compression = opt['gradient_compression']
            print(f"  梯度壓縮: {compression['method']} ({compression['ratio']}x)")

        if 'communication_overlap' in opt:
            overlap = opt['communication_overlap']
            print(f"  通訊重疊: ✅ ({overlap['overlap_ratio']:.1%})")

        print("\n📊 監控配置:")
        mon = config['monitoring']
        print(f"  日誌間隔: {mon['log_interval']}步")
        print(f"  保存間隔: {mon['save_interval']}步")
        print(f"  驗證間隔: {mon['eval_interval']}步")
        print(f"  GPU監控: {'✅' if mon['monitor_gpu_memory'] else '❌'}")
        print(f"  通訊監控: {'✅' if mon['monitor_communication'] else '❌'}")

    def save_config_file(self, config, filename='ddp_config.json'):
        """Write ``config`` to ``filename`` as UTF-8 JSON (non-ASCII kept)."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2, ensure_ascii=False)
        print(f"\n💾 配置已保存到 {filename}")

# Configuration-generation demo
print("=== DDP 配置生成演示 ===")

config_generator = DDPOptimizationConfigGenerator()

# Generate a configuration for the demo model, letting the generator pick
# the hardware profile, then print it and persist it as JSON.
optimized_config = config_generator.generate_config(
    demo_model, hardware_profile='auto', target_batch_size=64
)
config_generator.print_config(optimized_config)
config_generator.save_config_file(optimized_config, 'optimal_ddp_config.json')

# Compare the configurations produced for several hardware profiles
print("\n=== 不同硬體環境配置對比 ===")
hardware_types = ['single_gpu', 'quad_gpu', 'dgx_node']

for hw_type in hardware_types:
    print(f"\n--- {hw_type.upper()} ---")
    test_config = config_generator.generate_config(
        demo_model, hardware_profile=hw_type, target_batch_size=64
    )

    # Only the key figures are shown, as one comma-separated line
    hw = test_config['hardware']
    train = test_config['training']
    amp_mark = '✅' if train['use_mixed_precision'] else '❌'
    summary_fields = [
        f"GPU: {hw['gpu_count']}",
        f"批次: {train['batch_size_per_gpu']}",
        f"累積: {train['gradient_accumulation_steps']}",
        f"有效批次: {train['effective_batch_size']}",
        f"AMP: {amp_mark}",
    ]
    print(", ".join(summary_fields))

## 8. 總結與最佳實踐

In [None]:
# Closing summary: techniques covered in this lab
print("=== Lab-1.2 Optimization 完成總結 ===")
print()
print("✅ 已完成的優化技術學習:")
print("  1. ✅ 通訊分析與瓶頸識別")
print("  2. ✅ 梯度累積優化策略")
print("  3. ✅ 混合精度訓練配置")
print("  4. ✅ 計算通訊重疊技術")
print("  5. ✅ 性能監控與分析")
print("  6. ✅ 自動化配置生成")
print()

# Key optimization strategies, grouped by area
print("🎯 關鍵優化策略總結:")
print()

print("📡 通訊優化:")
print("  • 使用適當的通訊後端 (NCCL for GPU)")
print("  • 調整DDP bucket大小以平衡延遲和吞吐量")
print("  • 啟用梯度壓縮 (FP16/BF16) 減少數據傳輸")
print("  • 使用梯度累積減少通訊頻率")
print()

print("💾 記憶體優化:")
print("  • 混合精度訓練 (AMP) 節省記憶體")
print("  • 梯度檢查點技術適用於超大模型")
# Offloading optimizer state to the CPU is ZeRO-Offload; ZeRO stage 2 itself
# partitions optimizer state and gradients across GPUs.
print("  • 優化器狀態卸載到CPU (ZeRO-Offload)")
print("  • 動態批次大小調整")
print()

print("⚡ 計算優化:")
print("  • 計算通訊重疊最大化硬體利用率")
print("  • 使用編譯優化 (torch.compile)")
print("  • Tensor Core優化混合精度運算")
print("  • 適當的數據載入並行度")
print()

print("📊 監控優化:")
print("  • 實時性能監控識別瓶頸")
print("  • GPU利用率和記憶體使用追蹤")
print("  • 通訊開銷分析")
print("  • 自動化性能調優")
print()

# Best-practice recommendations per model-size class, printed as a bullet
# list. The table is stored row-wise and expanded into one dict per class.
print("🔧 配置最佳實踐:")

_practice_aspects = ('batch_size', 'gradient_accumulation', 'mixed_precision', 'communication')
_practice_rows = (
    ('小模型 (<10M參數)', ('盡量大', '1-2步', '可選', '標準設置')),
    ('中型模型 (10M-1B參數)', ('適中', '4-8步', '強烈推薦', '梯度壓縮')),
    ('大型模型 (>1B參數)', ('記憶體限制', '16+步', '必需', '全面優化')),
)
optimization_best_practices = {
    size_label: dict(zip(_practice_aspects, recommendations))
    for size_label, recommendations in _practice_rows
}

for model_size, practices in optimization_best_practices.items():
    print(f"\n{model_size}:")
    print("\n".join(
        f"  • {aspect}: {recommendation}"
        for aspect, recommendation in practices.items()
    ))

# Remaining summary sections (practical advice, generated artifacts, next
# steps, key takeaways), emitted in one call; "" entries are blank lines.
_closing_lines = (
    "",
    "🚀 多GPU環境實戰建議:",
    "  1. 從小規模開始，逐步擴展到多GPU",
    "  2. 使用性能監控識別瓶頸",
    "  3. 測試不同的批次大小和累積策略",
    "  4. 驗證數值穩定性和收斂性",
    "  5. 建立自動化調優流程",
    "",
    "📁 生成的工具和配置:",
    "  • ddp_performance_curves.png: 性能監控圖表",
    "  • optimal_ddp_config.json: 優化配置文件",
    "  • DDPCommunicationAnalyzer: 通訊分析工具",
    "  • MixedPrecisionDDPTrainer: 混合精度訓練器",
    "  • DDPPerformanceMonitor: 性能監控器",
    "  • DDPOptimizationConfigGenerator: 配置生成器",
    "",
    "📝 下一步學習:",
    "  - 04-Advanced.ipynb: 進階技術和故障處理",
    "  - Lab-1.3: DeepSpeed ZeRO 優化",
    "  - 在真實多GPU環境中驗證優化效果",
    "",
    "💡 關鍵記憶點:",
    "  - 通訊開銷隨GPU數量線性增長",
    "  - 梯度累積是減少通訊的最有效方法",
    "  - 混合精度既節省記憶體又提升速度",
    "  - 計算通訊重疊可獲得顯著加速",
    "  - 性能監控是持續優化的基礎",
)
print("\n".join(_closing_lines))

# Persist a machine-readable summary of the lab: techniques covered, tools
# built, the best-practice table and the artifacts written to disk.
_techniques_covered = [
    'Communication Analysis',
    'Gradient Accumulation',
    'Mixed Precision Training',
    'Computation-Communication Overlap',
    'Performance Monitoring',
    'Automated Configuration',
]
_tools_created = [
    'DDPCommunicationAnalyzer',
    'GradientAccumulationOptimizer',
    'MixedPrecisionDDPTrainer',
    'ComputationCommunicationOverlap',
    'DDPPerformanceMonitor',
    'DDPOptimizationConfigGenerator',
]
_files_generated = [
    'ddp_performance_curves.png',
    'optimal_ddp_config.json',
]

optimization_summary = dict(
    techniques_covered=_techniques_covered,
    tools_created=_tools_created,
    best_practices=optimization_best_practices,
    files_generated=_files_generated,
)

# ensure_ascii=False keeps the Chinese labels readable in the output file
Path('ddp_optimization_summary.json').write_text(
    json.dumps(optimization_summary, indent=2, ensure_ascii=False),
    encoding='utf-8',
)

print("\n💾 優化總結已保存到 ddp_optimization_summary.json")