# 6.3.2 性能優化挑戰 - 高級練習

本練習專注於計算機視覺算法的極致性能優化，包括多線程、GPU加速、算法優化等高級技術。

## 練習目標
- 掌握性能分析和瓶頸識別
- 實現多種優化技術
- 學習並行計算和GPU加速
- 達到生產級性能要求
- 建立性能監控體系

## 難度等級: ⭐⭐⭐⭐⭐ (專家級)

In [None]:
import cv2
import numpy as np
import time
import threading
import multiprocessing as mp
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import queue
import psutil
import sys
from numba import jit, cuda
import cProfile
import pstats
from memory_profiler import profile

sys.path.append('../../utils')
from performance import time_function, benchmark_function

print(f"✅ 環境設置完成")
print(f"CPU核心數: {mp.cpu_count()}")
print(f"CUDA設備數: {cv2.cuda.getCudaEnabledDeviceCount()}")

## 挑戰1: 性能分析與瓶頸識別

In [None]:
class PerformanceProfiler:
    """Collect wall-clock timings and cProfile statistics for callables."""

    def __init__(self):
        # Free-form metric storage; no method of this class reads or writes it.
        self.metrics = {}

    def profile_function(self, func, *args, **kwargs):
        """Run ``func(*args, **kwargs)`` once under cProfile and time it.

        Args:
            func: Callable to profile.
            *args: Positional arguments forwarded to ``func``.
            **kwargs: Keyword arguments forwarded to ``func``.

        Returns:
            A dict with the keys ``'result'`` (the return value of ``func``),
            ``'execution_time'`` (elapsed seconds) and ``'profile_stats'``
            (a ``pstats.Stats`` built from the profiler).

        Raises:
            Whatever ``func`` raises. The profiler is disabled before the
            exception propagates.
        """
        profiler = cProfile.Profile()
        profiler.enable()
        try:
            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start_time
        finally:
            # Always switch the profiler off, otherwise a failing ``func``
            # would leave it active for the rest of the session.
            profiler.disable()

        return {
            'result': result,
            'execution_time': elapsed,
            'profile_stats': pstats.Stats(profiler),
        }

    def benchmark_algorithms(self, algorithms, test_data, runs=5):
        """Time every algorithm on the same input and summarise the timings.

        Args:
            algorithms: Mapping of display name to a callable that takes
                ``test_data`` as its only argument.
            test_data: Input passed unchanged to every callable.
            runs: Number of timed calls per algorithm (default 5).

        Returns:
            A dict keyed by algorithm name. Each value is a dict with
            ``'mean_time'``, ``'std_time'``, ``'min_time'`` and ``'max_time'``
            in milliseconds.

        Raises:
            ValueError: If ``runs`` is smaller than 1.
        """
        if runs < 1:
            raise ValueError("runs must be at least 1")

        results = {}

        for name, algorithm in algorithms.items():
            print(f"🔄 測試 {name}...")

            times = []
            for _ in range(runs):
                start = time.perf_counter()
                algorithm(test_data)
                # Store milliseconds so the report below needs no conversion.
                times.append((time.perf_counter() - start) * 1000)

            results[name] = {
                'mean_time': np.mean(times),
                'std_time': np.std(times),
                'min_time': np.min(times),
                'max_time': np.max(times)
            }

            print(f"  平均耗時: {results[name]['mean_time']:.2f} ± {results[name]['std_time']:.2f}ms")

        return results

# Create the performance profiler instance used by the later benchmark cell
profiler = PerformanceProfiler()

## 挑戰2: 多線程優化

In [None]:
class OptimizedImageProcessor:
    """Run one image pipeline sequentially, with threads, or with processes.

    Every ``process_images_*`` method returns ``(results, total_time)`` where
    ``results`` holds one processed image per input, in input order, and
    ``total_time`` is the elapsed time in seconds.
    """

    def __init__(self, num_workers=4):
        """Store the worker count used by the thread and process pools."""
        self.num_workers = num_workers

    def process_single_image(self, image):
        """Apply the pipeline to one image and return the result.

        The steps are: BGR to grayscale, 15x15 Gaussian blur, Canny edge
        detection (thresholds 50/150) and a morphological close with a 5x5
        rectangular kernel.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (15, 15), 0)
        edges = cv2.Canny(blurred, 50, 150)

        # Morphological close applied to the edge map
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
        processed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)

        return processed

    def process_images_sequential(self, images):
        """Process the images one after another in the calling thread."""
        # perf_counter is monotonic, unlike the wall clock returned by time.time().
        start_time = time.perf_counter()

        results = [self.process_single_image(img) for img in images]

        total_time = time.perf_counter() - start_time
        return results, total_time

    def process_images_threaded(self, images):
        """Process the images with a pool of ``num_workers`` threads."""
        start_time = time.perf_counter()

        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            results = list(executor.map(self.process_single_image, images))

        total_time = time.perf_counter() - start_time
        return results, total_time

    def process_images_multiprocess(self, images):
        """Process the images with a pool of ``num_workers`` processes.

        The measured time includes creating and shutting down the pool.
        """
        # NOTE(review): the bound method and every image are sent to worker
        # processes, so presumably this class must be importable by the
        # workers - confirm when running from an interactive session.
        start_time = time.perf_counter()

        with ProcessPoolExecutor(max_workers=self.num_workers) as executor:
            results = list(executor.map(self.process_single_image, images))

        total_time = time.perf_counter() - start_time
        return results, total_time

# Create the optimized processor with its default worker count
processor = OptimizedImageProcessor()

# Build the set of random test images. The upper bound of randint is
# exclusive, so 256 is needed for pixels to cover the full uint8 range.
test_images = [np.random.randint(0, 256, (500, 500, 3), dtype=np.uint8)
               for _ in range(20)]

print(f"📊 創建了 {len(test_images)} 張測試圖像")

## 挑戰3: GPU加速優化

In [None]:
@jit(nopython=True)
def numba_gaussian_blur(image, kernel_size=5):
    """Blur a 2-D image with a square Gaussian kernel, compiled by Numba.

    Args:
        image: 2-D array (a single channel). The result has the same shape
            and dtype, so pass a floating-point array to avoid the
            convolution sums being cast back to an integer dtype.
        kernel_size: Side length of the square kernel. Sigma is derived from
            it as ``kernel_size / 3``.

    Returns:
        The blurred image. Pixels near the border are computed with the
        nearest edge pixel repeated, so the output has no unfiltered frame.
    """
    height, width = image.shape
    result = np.zeros_like(image)

    # Build the normalized Gaussian kernel
    sigma = kernel_size / 3.0
    kernel = np.zeros((kernel_size, kernel_size))
    center = kernel_size // 2

    for i in range(kernel_size):
        for j in range(kernel_size):
            x, y = i - center, j - center
            kernel[i, j] = np.exp(-(x*x + y*y) / (2 * sigma * sigma))

    kernel = kernel / np.sum(kernel)

    # Convolve every pixel. Sample coordinates are clamped to the image so
    # border pixels are filtered as well instead of being left at zero, which
    # would show up as a dark frame (and a strong false edge) downstream.
    last_row = height - 1
    last_col = width - 1
    for i in range(height):
        for j in range(width):
            conv_sum = 0.0
            for ki in range(kernel_size):
                row = min(max(i - center + ki, 0), last_row)
                for kj in range(kernel_size):
                    col = min(max(j - center + kj, 0), last_col)
                    conv_sum += image[row, col] * kernel[ki, kj]
            result[i, j] = conv_sum

    return result

class GPUAcceleratedProcessor:
    """Edge-detection pipeline with CPU, CUDA and Numba implementations.

    Each ``process_*`` method takes a BGR image and returns a Canny edge map.
    """

    def __init__(self):
        """Detect CUDA support once and report it."""
        self.cuda_available = cv2.cuda.getCudaEnabledDeviceCount() > 0
        print(f"CUDA可用: {'是' if self.cuda_available else '否'}")

    def process_cpu(self, image):
        """Grayscale, 15x15 Gaussian blur and Canny (50/150) on the CPU."""
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (15, 15), 0)
        edges = cv2.Canny(blurred, 50, 150)
        return edges

    def process_gpu(self, image):
        """Run the same pipeline through OpenCV's CUDA module.

        Falls back to ``process_cpu`` when no CUDA device was detected or
        when any step of the GPU path fails.
        """
        if not self.cuda_available:
            return self.process_cpu(image)

        try:
            # Upload the image to device memory
            gpu_image = cv2.cuda_GpuMat()
            gpu_image.upload(image)

            # Color conversion on the GPU
            gpu_gray = cv2.cuda.cvtColor(gpu_image, cv2.COLOR_BGR2GRAY)

            # The CUDA module exposes blur and Canny as factory-created
            # objects (filter.apply / detector.detect); it has no
            # cv2.cuda.GaussianBlur or cv2.cuda.Canny functions.
            gaussian = cv2.cuda.createGaussianFilter(
                cv2.CV_8UC1, cv2.CV_8UC1, (15, 15), 0
            )
            gpu_blurred = gaussian.apply(gpu_gray)

            canny = cv2.cuda.createCannyEdgeDetector(50, 150)
            gpu_edges = canny.detect(gpu_blurred)

            # Download the result back to host memory
            return gpu_edges.download()
        except Exception as e:
            # Deliberately broad: any GPU-side failure is reported and the
            # CPU implementation is used so the caller still gets a result.
            print(f"GPU處理失敗: {e}")
            return self.process_cpu(image)

    def process_numba(self, image):
        """Run the pipeline with the Numba-compiled Gaussian blur.

        Uses ``numba_gaussian_blur`` with its default kernel size, then
        OpenCV's Canny on the result.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = numba_gaussian_blur(gray.astype(np.float64))
        # Round and clip before narrowing; a bare astype(np.uint8) truncates
        # the fractional part and biases every pixel downwards.
        blurred_u8 = np.clip(np.rint(blurred), 0, 255).astype(np.uint8)
        edges = cv2.Canny(blurred_u8, 50, 150)
        return edges

# Create the GPU-accelerated processor (prints whether CUDA is available)
gpu_processor = GPUAcceleratedProcessor()

## 挑戰4: 算法優化比較

In [None]:
def run_optimization_benchmark():
    """Benchmark the CPU, GPU and Numba pipelines on one random image.

    Uses the module-level ``gpu_processor`` and ``profiler`` objects. Prints
    the mean time of each implementation together with its speed relative
    to the fastest one (the fastest implementation is reported as 1.0x).

    Returns:
        The per-algorithm timing dict produced by
        ``profiler.benchmark_algorithms``.
    """
    print("🏃‍♂️ 開始性能優化基準測試")

    # Random test image. The upper bound of randint is exclusive, so 256 is
    # needed for pixels to cover the full uint8 range.
    test_image = np.random.randint(0, 256, (1000, 1000, 3), dtype=np.uint8)

    # Implementations under test
    algorithms = {
        'CPU標準': gpu_processor.process_cpu,
        'GPU加速': gpu_processor.process_gpu,
        'Numba加速': gpu_processor.process_numba
    }

    # Untimed warm-up call for each implementation, so one-time costs such as
    # Numba's JIT compilation on first call do not end up in the measurements.
    for algorithm in algorithms.values():
        algorithm(test_image)

    # Run the timed benchmark
    results = profiler.benchmark_algorithms(algorithms, test_image)

    print("\n📊 性能比較結果:")
    print("-" * 50)

    # The fastest mean time is the baseline for the relative speed column
    fastest_time = min(metrics['mean_time'] for metrics in results.values())

    for name, metrics in results.items():
        mean_time = metrics['mean_time']
        # A zero mean can only belong to the fastest entry; report it as 1.0x
        # instead of dividing by zero.
        speedup = fastest_time / mean_time if mean_time > 0 else 1.0
        print(f"{name:12}: {metrics['mean_time']:6.1f}ms (相對速度: {speedup:.1f}x)")

    return results

# Run the benchmark and keep the per-algorithm timing results
optimization_results = run_optimization_benchmark()

## 挑戰5: 記憶體優化

In [None]:
class MemoryOptimizer:
    """Compare a tiled pipeline with one that keeps full-size intermediates.

    Both processing methods return ``(result, memory_used)`` where
    ``memory_used`` is the change in this process's resident set size (MB)
    between the start and the end of the call. That difference is not
    guaranteed to be positive.
    """

    def __init__(self):
        # Handle on the current process, used to read its memory usage.
        self.process = psutil.Process()

    def get_memory_usage(self):
        """Return the resident set size of this process in megabytes."""
        return self.process.memory_info().rss / 1024 / 1024  # MB

    def memory_efficient_processing(self, large_image, block_size=256):
        """Convert to grayscale and apply a 5x5 Gaussian blur tile by tile.

        Args:
            large_image: 3-D image array (height, width, channels) accepted by
                ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.
            block_size: Side length in pixels of the square tiles.

        Returns:
            ``(result, memory_used)``: the single-channel blurred image and
            the RSS change in MB.

        Raises:
            ValueError: If ``block_size`` is smaller than 1.
        """
        if block_size < 1:
            raise ValueError("block_size must be at least 1")

        initial_memory = self.get_memory_usage()

        h, w = large_image.shape[:2]
        # Radius of the 5x5 kernel. Each tile is read with this many extra
        # pixels on every side so pixels at a tile edge are blurred with their
        # real neighbours; blurring bare tiles treats each tile edge as an
        # image border and leaves visible seams.
        halo = 2

        # Single-channel output; every pixel is written by the loop below.
        result = np.empty((h, w), dtype=large_image.dtype)

        for y in range(0, h, block_size):
            y_end = min(y + block_size, h)
            y0 = max(y - halo, 0)
            y1 = min(y_end + halo, h)

            for x in range(0, w, block_size):
                x_end = min(x + block_size, w)
                x0 = max(x - halo, 0)
                x1 = min(x_end + halo, w)

                # Tile plus halo (a view, no copy of the source pixels)
                block = large_image[y0:y1, x0:x1]

                gray_block = cv2.cvtColor(block, cv2.COLOR_BGR2GRAY)
                blurred_block = cv2.GaussianBlur(gray_block, (5, 5), 0)

                # Keep only the tile itself and drop the halo. The per-tile
                # temporaries are rebound on the next iteration, which
                # releases the previous tile's buffers.
                result[y:y_end, x:x_end] = blurred_block[
                    y - y0:y_end - y0, x - x0:x_end - x0
                ]

        final_memory = self.get_memory_usage()
        memory_used = final_memory - initial_memory

        return result, memory_used

    def memory_intensive_processing(self, large_image):
        """Baseline for comparison: average three full-size Gaussian blurs.

        Keeps the grayscale image, three blurred copies (5x5, 15x15, 25x25)
        and their float32 conversions alive at the same time.

        Returns:
            ``(result, memory_used)``: the averaged uint8 image and the RSS
            change in MB.
        """
        initial_memory = self.get_memory_usage()

        # Several full-size intermediate results
        gray = cv2.cvtColor(large_image, cv2.COLOR_BGR2GRAY)
        blurred1 = cv2.GaussianBlur(gray, (5, 5), 0)
        blurred2 = cv2.GaussianBlur(gray, (15, 15), 0)
        blurred3 = cv2.GaussianBlur(gray, (25, 25), 0)

        # Average the three blurs in float32, then narrow back to uint8
        result = (blurred1.astype(np.float32) + blurred2.astype(np.float32) + blurred3.astype(np.float32)) / 3
        result = result.astype(np.uint8)

        final_memory = self.get_memory_usage()
        memory_used = final_memory - initial_memory

        return result, memory_used

# Exercise the memory optimizer
memory_optimizer = MemoryOptimizer()

# Large random test image. The upper bound of randint is exclusive, so 256 is
# needed for pixels to cover the full uint8 range.
large_test_image = np.random.randint(0, 256, (2000, 2000, 3), dtype=np.uint8)
print(f"📏 大圖像尺寸: {large_test_image.shape}")

# Compare memory usage of the two approaches
print("\n🧠 記憶體使用比較:")

# Tiled, memory-efficient approach
result1, memory1 = memory_optimizer.memory_efficient_processing(large_test_image)
print(f"記憶體高效處理: {memory1:.1f} MB")

# Memory-intensive approach
result2, memory2 = memory_optimizer.memory_intensive_processing(large_test_image)
print(f"記憶體密集處理: {memory2:.1f} MB")

# The measured RSS delta can be zero (or negative), in which case a
# percentage saving is undefined and the division would fail.
if memory2 > 0:
    print(f"記憶體節省: {((memory2 - memory1) / memory2 * 100):.1f}%")
else:
    print("記憶體節省: 無法計算")

## 總結與性能評估

### 🏆 優化技術掌握度檢核
- [ ] 實現了性能分析和瓶頸識別
- [ ] 掌握了多線程並行優化
- [ ] 理解了GPU加速的原理和應用
- [ ] 實現了記憶體使用優化
- [ ] 建立了完整的基準測試框架

### 📊 性能優化成果
- CPU vs GPU加速比較
- 多線程 vs 單線程效率提升
- 記憶體使用優化效果
- 算法複雜度分析

### 🎯 實際應用價值
這些優化技術直接應用於:
- 實時視頻處理系統
- 大規模圖像處理管道
- 邊緣計算設備部署
- 雲端服務性能提升

**完成此挑戰表示已具備生產級性能優化能力！** 🎊