## 📖 總結\n\n本實驗完成了企業級 Triton 推理服務的高級特性實現：\n\n### 🎯 實驗成果\n1. **模型熱更新系統** - 實現了零停機時間的藍綠部署機制\n2. **自動故障轉移** - 構建了智能化的高可用性架構\n3. **企業級監控** - 建立了全面的健康檢查和告警體系\n4. **最佳實踐指南** - 提供了完整的企業級部署參考\n\n### 🔧 關鍵技術點\n- 企業級架構設計模式\n- 零停機部署策略實現\n- 自動化故障檢測和恢復\n- 多層次監控和告警機制\n\n### 🚀 實際應用價值\n1. **業務連續性保障** - 99.9% 以上的服務可用性\n2. **風險控制能力** - 自動化的故障檢測和恢復\n3. **運維效率提升** - 智能化的部署和管理流程\n4. **成本效益優化** - 減少人工干預和停機損失\n\n### 💡 學習要點\n- 企業級部署需要考慮業務連續性和風險控制\n- 自動化是提升運維效率的關鍵\n- 監控和可觀測性是系統穩定運行的基礎\n- 最佳實踐的應用可以避免常見陷阱\n\n### 🔮 未來發展方向\n- **服務網格整合** - 與 Istio、Linkerd 等的深度整合\n- **AI 驅動運維** - 智能化的性能調優和故障預測\n- **邊緣計算支持** - 分散式推理架構的企業級支持\n- **多雲部署** - 跨雲平台的一致性管理\n\n---\n\n**🎉 恭喜完成 Lab 2.4.4！**\n\n您已經掌握了企業級 Triton 推理服務的核心技術，可以構建和管理大規模生產環境的 AI 推理系統。這些技能將為您在企業級 AI 部署領域提供強大的競爭優勢。"

In [None]:
# Best-practices guide for enterprise-grade advanced features.\n# The guide is a static, user-facing multi-line string that is printed verbatim below;\n# its sections cover hot-update strategies, failover, traffic management, observability,\n# security/compliance, operations automation, success factors and common pitfalls.\nbest_practices_guide = \"\"\"\n🏢 企業級 Triton 推理服務最佳實踐\n\n📋 模型熱更新策略:\n   ✅ 藍綠部署 - 適用於關鍵業務系統，零停機時間\n   ✅ 金絲雀發布 - 適用於風險控制，漸進式驗證\n   ✅ 滾動更新 - 適用於多實例環境，逐步替換\n   ✅ A/B 測試 - 適用於性能比較，數據驅動決策\n\n🔄 故障轉移機制:\n   ✅ 多層次健康檢查 (連接、響應時間、業務邏輯)\n   ✅ 智能端點選擇 (優先級、權重、地理位置)\n   ✅ 自動化故障恢復 (重試機制、熔斷器模式)\n   ✅ 即時告警和通知 (多渠道、分級告警)\n\n🚦 流量管理原則:\n   ✅ 基於規則的路由 (版本、用戶、地區)\n   ✅ 動態負載均衡 (加權輪詢、最少連接)\n   ✅ 流量限制和整形 (速率限制、突發處理)\n   ✅ 服務網格整合 (Istio、Linkerd 支持)\n\n📊 監控和可觀測性:\n   ✅ 全鏈路追踪 (請求生命週期)\n   ✅ 多維度指標 (延遲、吞吐量、錯誤率)\n   ✅ 實時告警系統 (閾值、趨勢、異常)\n   ✅ 運維儀表板 (可視化、互動式)\n\n🛡️  安全和合規:\n   ✅ 端到端加密 (TLS/mTLS)\n   ✅ 身份認證和授權 (RBAC、OAuth2)\n   ✅ 審計日誌 (操作記錄、合規追踪)\n   ✅ 資料保護 (敏感資訊遮罩)\n\n🔧 運維自動化:\n   ✅ 基礎設施即代碼 (Terraform、Helm)\n   ✅ CI/CD 管道整合 (GitOps 工作流)\n   ✅ 自動化測試 (單元、整合、端到端)\n   ✅ 容災恢復 (備份、複製、還原)\n\n💡 成功關鍵因素:\n   🎯 明確的 SLA 目標 (可用性、性能、容量)\n   📈 數據驅動的決策 (指標、趨勢、分析)\n   🔄 持續改進流程 (回顧、優化、創新)\n   👥 跨團隊協作 (開發、運維、業務)\n   📚 知識管理和文檔 (最佳實踐、故障手冊)\n\n⚠️  常見陷阱避免:\n   ❌ 過度複雜的架構設計\n   ❌ 缺乏充分的測試覆蓋\n   ❌ 忽視監控和告警設置\n   ❌ 未建立適當的回滾機制\n   ❌ 缺乏災難恢復計劃\n\"\"\"\n\n# Show the guide in the cell output.\nprint(best_practices_guide)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

## 📊 最佳實踐總結

In [None]:
# 故障轉移演示\nasync def demonstrate_failover():\n    \"\"\"演示自動故障轉移\"\"\"\n    print(\"🚀 開始故障轉移演示\")\n    print(\"=\" * 60)\n    \n    # 啟動監控\n    await failover_manager.start_monitoring()\n    \n    # 模擬一段時間的正常運行\n    print(\"\\n⏱️  模擬 30 秒正常運行...\")\n    await asyncio.sleep(3)  # 簡化為3秒\n    \n    # 顯示服務狀態\n    print(\"\\n📊 當前服務狀態:\")\n    for service_name in failover_manager.services.keys():\n        status = failover_manager.get_service_status(service_name)\n        print(f\"\\n🔧 服務: {service_name}\")\n        print(f\"   總端點數: {status['total_endpoints']}\")\n        print(f\"   健康端點: {status['healthy_endpoints']}\")\n        print(f\"   警告端點: {status['warning_endpoints']}\")\n        print(f\"   故障端點: {status['critical_endpoints']}\")\n        print(f\"   主要端點: {status['primary_endpoint']}\")\n    \n    # 模擬主要端點故障\n    print(\"\\n⚠️  模擬主要端點故障...\")\n    primary_endpoint = failover_manager.services[\"triton_inference\"][0]  # triton-primary\n    \n    # 手動設置故障狀態來模擬故障\n    primary_endpoint.health_status = HealthStatus.CRITICAL\n    primary_endpoint.consecutive_failures = 5\n    primary_endpoint.response_time_ms = 8000\n    primary_endpoint.success_rate = 20.0\n    \n    print(f\"   {primary_endpoint.name} 已被標記為故障\")\n    \n    # 執行一輪健康檢查和故障轉移評估\n    print(\"\\n🏥 執行故障檢測和轉移...\")\n    await failover_manager._evaluate_failover_rules()\n    \n    # 顯示故障轉移後的狀態\n    print(\"\\n📊 故障轉移後的服務狀態:\")\n    status = failover_manager.get_service_status(\"triton_inference\")\n    print(f\"   健康端點: {status['healthy_endpoints']}\")\n    print(f\"   故障端點: {status['critical_endpoints']}\")\n    \n    # 模擬端點恢復\n    print(\"\\n✅ 模擬主要端點恢復...\")\n    primary_endpoint.health_status = HealthStatus.HEALTHY\n    primary_endpoint.consecutive_failures = 0\n    primary_endpoint.response_time_ms = 150\n    primary_endpoint.success_rate = 99.5\n    \n    await failover_manager._log_failover_event(\"triton_inference\", primary_endpoint, \"recovery\")\n    \n    # 停止監控\n    await 
failover_manager.stop_monitoring()\n    \n    print(\"\\n\" + \"=\" * 60)\n    print(\"🎉 故障轉移演示完成\")\n    \n    # 顯示故障轉移歷史\n    if failover_manager.failover_history:\n        print(f\"\\n📜 故障轉移事件記錄 ({len(failover_manager.failover_history)} 個事件):\")\n        for i, event in enumerate(failover_manager.failover_history[-3:], 1):  # 顯示最近3個事件\n            print(f\"   {i}. {event['event_type']} - {event['affected_endpoint']['name']} \")\n            print(f\"      時間: {event['timestamp'].strftime('%H:%M:%S')}\")\n            if 'backup_endpoint' in event:\n                print(f\"      備用: {event['backup_endpoint']['name']}\")\n\n\n# 執行故障轉移演示\nawait demonstrate_failover()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

In [None]:
# Configure the failover rules and register them with the manager.
print("📋 配置故障轉移規則...")

# One row per rule: (name, trigger_condition, threshold_value, action, cooldown_seconds).
_failover_rule_specs = (
    ("連續失敗觸發故障轉移", "consecutive_failures", 3, "failover", 30),
    ("響應時間過長告警", "response_time", 5000, "alert", 60),        # threshold: 5 seconds in ms
    ("成功率過低故障轉移", "success_rate", 50.0, "failover", 120),   # threshold: 50 %
    ("嚴重狀態移除端點", "health_status", 1, "remove", 300),         # threshold: 1 = critical
)

failover_rules = [
    FailoverRule(
        name=rule_name,
        trigger_condition=condition,
        threshold_value=threshold,
        action=action,
        cooldown_seconds=cooldown,
    )
    for rule_name, condition, threshold, action, cooldown in _failover_rule_specs
]

# Hand every rule to the manager, in declaration order.
for rule in failover_rules:
    failover_manager.add_failover_rule(rule)

print(f"\n✅ 已配置 {len(failover_rules)} 個故障轉移規則")
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

In [None]:
# Configure the service endpoints and register them with the failover manager.
print("🔧 配置故障轉移服務端點...")

# Triton inference endpoints, ordered from primary to backup; the position in
# this table (starting at 1) becomes the endpoint priority.
# Row layout: (name, url, weight, region, zone).
_triton_endpoint_specs = (
    ("triton-primary", "localhost:8000", 100, "us-west", "us-west-1a"),
    ("triton-secondary", "localhost:8001", 50, "us-west", "us-west-1b"),
    ("triton-backup", "localhost:8002", 25, "us-east", "us-east-1a"),
)
triton_endpoints = [
    ServiceEndpoint(
        name=ep_name,
        url=ep_url,
        priority=rank,
        weight=ep_weight,
        metadata={"region": ep_region, "zone": ep_zone},
    )
    for rank, (ep_name, ep_url, ep_weight, ep_region, ep_zone)
    in enumerate(_triton_endpoint_specs, start=1)
]

# API gateway endpoints, same convention. Row layout: (name, url, weight).
_gateway_endpoint_specs = (
    ("gateway-main", "http://api.example.com", 100),
    ("gateway-backup", "http://backup-api.example.com", 50),
)
api_gateway_endpoints = [
    ServiceEndpoint(
        name=ep_name,
        url=ep_url,
        priority=rank,
        weight=ep_weight,
        metadata={"type": "nginx", "version": "1.20"},
    )
    for rank, (ep_name, ep_url, ep_weight)
    in enumerate(_gateway_endpoint_specs, start=1)
]

# Register both services.
failover_manager.register_service("triton_inference", triton_endpoints)
failover_manager.register_service("api_gateway", api_gateway_endpoints)

print(f"\n✅ 已註冊 {len(failover_manager.services)} 個服務")
for service_name, endpoints in failover_manager.services.items():
    print(f"   {service_name}: {len(endpoints)} 個端點")
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

### 2.2 故障轉移配置和演示

In [None]:
class HealthStatus(Enum):
    """Health state of a service endpoint."""

    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"


@dataclass
class ServiceEndpoint:
    """Description and live health data of one service endpoint."""

    name: str
    url: str
    priority: int  # 1 = primary, 2 = secondary, 3 = backup
    weight: int = 1
    health_status: HealthStatus = HealthStatus.UNKNOWN
    last_check: datetime = field(default_factory=datetime.now)
    consecutive_failures: int = 0
    response_time_ms: float = 0.0
    success_rate: float = 100.0
    metadata: Dict[str, Any] = field(default_factory=dict)

    def is_available(self) -> bool:
        """Return True when the endpoint is HEALTHY or WARNING."""
        return self.health_status in [HealthStatus.HEALTHY, HealthStatus.WARNING]


@dataclass
class FailoverRule:
    """A rule that maps an endpoint condition to a failover action."""

    name: str
    # One of "consecutive_failures", "response_time", "success_rate",
    # "health_status"; any other value never triggers.
    trigger_condition: str
    threshold_value: float
    action: str  # "failover", "alert" or "remove"
    # Minimum number of seconds between two firings of this rule for the
    # same service endpoint.
    cooldown_seconds: int = 60
    enabled: bool = True


class AutoFailoverManager:
    """Monitor registered service endpoints and apply failover rules.

    The manager keeps a list of endpoints per service, periodically health
    checks them in a background asyncio task, and evaluates the configured
    ``FailoverRule`` objects against every endpoint. Each rule fires at most
    once per ``cooldown_seconds`` for a given service endpoint.
    """

    # Comparison actually applied by ``_should_trigger_rule`` for each
    # trigger condition; used only to print an accurate rule description.
    _RULE_COMPARATORS = {
        "consecutive_failures": ">=",
        "response_time": ">=",
        "success_rate": "<=",
        "health_status": "<=",
    }

    def __init__(self, health_check_interval: int = MONITORING_INTERVAL):
        """Create an empty manager.

        Args:
            health_check_interval: Seconds to sleep between monitoring rounds.
        """
        self.services: Dict[str, List[ServiceEndpoint]] = {}
        self.failover_rules: List[FailoverRule] = []
        self.health_check_interval = health_check_interval
        self.monitoring_active = False
        self.monitoring_task = None
        self.failover_history: List[Dict] = []
        self.metrics_history: Dict[str, List[Dict]] = {}
        # (rule name, service name, endpoint name) -> time.monotonic() of the
        # last firing; used to enforce FailoverRule.cooldown_seconds.
        self._rule_last_fired: Dict[Any, float] = {}

        # Directory that receives the failover event log.
        self.failover_dir = f"{ENTERPRISE_DIR}/failover"

        print("🔄 自動故障轉移管理器已初始化")
        print(f"   ⏱️  健康檢查間隔: {self.health_check_interval}秒")
        print(f"   📁 故障轉移目錄: {self.failover_dir}")

    def register_service(self, service_name: str, endpoints: List[ServiceEndpoint]):
        """Register a service together with its endpoints.

        Re-registering an existing service replaces its endpoints and clears
        its metrics history.
        """
        self.services[service_name] = endpoints
        self.metrics_history[service_name] = []

        print(f"📝 註冊服務: {service_name} ({len(endpoints)} 個端點)")
        for endpoint in sorted(endpoints, key=lambda x: x.priority):
            priority_label = {1: "主要", 2: "次要", 3: "備用"}.get(endpoint.priority, "其他")
            print(f"   🎯 {endpoint.name}: {endpoint.url} ({priority_label})")

    def add_failover_rule(self, rule: FailoverRule):
        """Append a failover rule and print a short description of it."""
        self.failover_rules.append(rule)
        # Print the comparison that is really applied for this condition
        # (e.g. success_rate triggers when the value drops *below* the
        # threshold); unknown conditions fall back to ">".
        comparator = self._RULE_COMPARATORS.get(rule.trigger_condition, ">")
        print(f"📋 添加故障轉移規則: {rule.name}")
        print(f"   觸發條件: {rule.trigger_condition} {comparator} {rule.threshold_value}")
        print(f"   執行動作: {rule.action}")

    async def start_monitoring(self):
        """Start the background monitoring task (no-op if already running)."""
        if self.monitoring_active:
            print("⚠️  監控已在運行中")
            return

        self.monitoring_active = True
        self.monitoring_task = asyncio.create_task(self._monitoring_loop())
        print("🏥 啟動健康監控")
        print(f"   監控服務數: {len(self.services)}")
        print(f"   故障轉移規則數: {len(self.failover_rules)}")

    async def stop_monitoring(self):
        """Cancel the background monitoring task and wait for it to finish."""
        self.monitoring_active = False
        if self.monitoring_task:
            self.monitoring_task.cancel()
            try:
                await self.monitoring_task
            except asyncio.CancelledError:
                pass
        print("🛑 健康監控已停止")

    async def _monitoring_loop(self):
        """Run health checks and rule evaluation until monitoring is stopped."""
        while self.monitoring_active:
            try:
                await self._perform_health_checks()
                await self._evaluate_failover_rules()
                await asyncio.sleep(self.health_check_interval)
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive; retry after a short pause.
                print(f"❌ 監控循環錯誤: {str(e)}")
                await asyncio.sleep(5)

    async def _perform_health_checks(self):
        """Health check every endpoint of every service concurrently."""
        tasks = [
            asyncio.create_task(self._check_endpoint_health(service_name, endpoint))
            for service_name, endpoints in self.services.items()
            for endpoint in endpoints
        ]

        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

    async def _check_endpoint_health(self, service_name: str, endpoint: ServiceEndpoint):
        """Probe one endpoint and update its health fields and metrics."""
        start_time = time.time()

        try:
            # Run the probe and measure how long it took.
            success = await self._ping_endpoint(endpoint)
            response_time = (time.time() - start_time) * 1000

            endpoint.response_time_ms = response_time
            endpoint.last_check = datetime.now()

            if success:
                old_status = endpoint.health_status

                # Classify by response time: < 1 s healthy, < 3 s warning,
                # otherwise critical.
                if response_time < 1000:
                    endpoint.health_status = HealthStatus.HEALTHY
                elif response_time < 3000:
                    endpoint.health_status = HealthStatus.WARNING
                else:
                    endpoint.health_status = HealthStatus.CRITICAL

                # A successful but too slow probe does not reset the counter.
                if endpoint.health_status != HealthStatus.CRITICAL:
                    endpoint.consecutive_failures = 0

                # Record a recovery when the endpoint leaves the critical state.
                if old_status == HealthStatus.CRITICAL and endpoint.health_status in [
                    HealthStatus.HEALTHY, HealthStatus.WARNING
                ]:
                    print(f"✅ 端點恢復: {service_name}/{endpoint.name}")
                    await self._log_failover_event(service_name, endpoint, "recovery")

            else:
                endpoint.consecutive_failures += 1
                endpoint.health_status = HealthStatus.CRITICAL

            self._update_success_rate(endpoint, success)
            self._record_metrics(service_name, endpoint, success, response_time)

        except Exception as e:
            endpoint.consecutive_failures += 1
            endpoint.health_status = HealthStatus.UNKNOWN
            print(f"❌ 健康檢查異常: {service_name}/{endpoint.name} - {str(e)}")

    async def _ping_endpoint(self, endpoint: ServiceEndpoint) -> bool:
        """Probe an endpoint with the check matching its type.

        An endpoint is treated as a Triton server when "triton" appears in
        its name or URL, or when the URL contains ":8000". The name is
        included because Triton client URLs are scheme-less "host:port"
        strings that the generic HTTP check cannot request.
        """
        try:
            label = f"{endpoint.name} {endpoint.url}".lower()
            if "triton" in label or ":8000" in endpoint.url:
                return await self._check_triton_endpoint(endpoint.url)
            return await self._check_http_endpoint(endpoint.url)
        except Exception:
            return False

    async def _check_triton_endpoint(self, url: str) -> bool:
        """Return True only when the Triton server reports live AND ready.

        The blocking client calls run in the default executor so they do not
        stall the event loop; the client is created, used and closed inside
        that single worker call.
        """
        def _probe() -> bool:
            client = httpclient.InferenceServerClient(url=url)
            try:
                # Both calls return booleans; they must be inspected rather
                # than merely invoked.
                return bool(client.is_server_live() and client.is_server_ready())
            finally:
                client.close()

        try:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, _probe)
        except Exception:
            return False

    async def _check_http_endpoint(self, url: str) -> bool:
        """Check a plain HTTP endpoint.

        ``{url}/health`` is requested first: 200 means healthy. When that
        request fails or answers 404 (no health route), the base URL is
        tried and any status below 500 counts as reachable. The blocking
        ``requests`` calls run in the default executor.
        """
        def _probe() -> bool:
            try:
                response = requests.get(f"{url}/health", timeout=HEALTH_CHECK_TIMEOUT)
                if response.status_code == 200:
                    return True
                if response.status_code != 404:
                    return False
            except Exception:
                pass  # fall through to the basic connectivity check

            try:
                response = requests.get(url, timeout=HEALTH_CHECK_TIMEOUT)
                return response.status_code < 500
            except Exception:
                return False

        try:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, _probe)
        except Exception:
            return False

    def _update_success_rate(self, endpoint: ServiceEndpoint, success: bool):
        """Update the success rate as an exponential moving average."""
        alpha = 0.1  # smoothing factor
        current_rate = 100.0 if success else 0.0
        endpoint.success_rate = (
            alpha * current_rate + (1 - alpha) * endpoint.success_rate
        )

    def _record_metrics(self, service_name: str, endpoint: ServiceEndpoint,
                        success: bool, response_time: float):
        """Append one metrics sample for the service, keeping the last 1000."""
        metric = {
            "timestamp": datetime.now(),
            "endpoint_name": endpoint.name,
            "success": success,
            "response_time_ms": response_time,
            "health_status": endpoint.health_status.value,
            "consecutive_failures": endpoint.consecutive_failures,
            "success_rate": endpoint.success_rate
        }

        history = self.metrics_history.setdefault(service_name, [])
        history.append(metric)

        # Bound the history size.
        max_history = 1000
        if len(history) > max_history:
            self.metrics_history[service_name] = history[-max_history:]

    def _acquire_cooldown(self, service_name: str, endpoint: ServiceEndpoint,
                          rule: FailoverRule) -> bool:
        """Return True if the rule may fire now and record the firing time.

        Returns False while fewer than ``rule.cooldown_seconds`` have passed
        since the rule last fired for this service endpoint.
        """
        key = (rule.name, service_name, endpoint.name)
        now = time.monotonic()
        last_fired = self._rule_last_fired.get(key)
        if last_fired is not None and now - last_fired < rule.cooldown_seconds:
            return False
        self._rule_last_fired[key] = now
        return True

    async def _evaluate_failover_rules(self):
        """Evaluate every enabled rule against every endpoint."""
        # Iterate over a snapshot: actions are awaited, and services may be
        # registered while this coroutine is suspended.
        for service_name, endpoints in list(self.services.items()):
            for endpoint in list(endpoints):
                for rule in self.failover_rules:
                    if not rule.enabled:
                        continue
                    if not self._should_trigger_rule(endpoint, rule):
                        continue
                    if not self._acquire_cooldown(service_name, endpoint, rule):
                        continue
                    await self._execute_failover_action(service_name, endpoint, rule)

    def _should_trigger_rule(self, endpoint: ServiceEndpoint, rule: FailoverRule) -> bool:
        """Return True when the endpoint's current state satisfies the rule."""
        if rule.trigger_condition == "consecutive_failures":
            return endpoint.consecutive_failures >= rule.threshold_value
        elif rule.trigger_condition == "response_time":
            return endpoint.response_time_ms >= rule.threshold_value
        elif rule.trigger_condition == "success_rate":
            return endpoint.success_rate <= rule.threshold_value
        elif rule.trigger_condition == "health_status":
            severity = {"critical": 1, "warning": 2, "healthy": 3}
            current_level = severity.get(endpoint.health_status.value)
            if current_level is None:
                # UNKNOWN (e.g. never checked yet) is not a confirmed bad
                # state and must not match a severity threshold.
                return False
            return current_level <= rule.threshold_value

        return False

    async def _execute_failover_action(self, service_name: str,
                                       endpoint: ServiceEndpoint, rule: FailoverRule):
        """Dispatch the rule's action ("failover", "alert" or "remove")."""
        if rule.action == "failover":
            await self._perform_failover(service_name, endpoint)
        elif rule.action == "alert":
            await self._send_alert(service_name, endpoint, rule)
        elif rule.action == "remove":
            await self._remove_endpoint(service_name, endpoint)

    async def _perform_failover(self, service_name: str, failed_endpoint: ServiceEndpoint):
        """Fail over from ``failed_endpoint`` to the best available backup."""
        print(f"🔄 執行故障轉移: {service_name}/{failed_endpoint.name}")

        # Candidates are all other available endpoints. Identity is used
        # because dataclass equality compares field values.
        available_endpoints = [
            ep for ep in self.services[service_name]
            if ep is not failed_endpoint and ep.is_available()
        ]

        if not available_endpoints:
            print(f"🚨 警告: 服務 {service_name} 沒有可用的備用端點")
            await self._send_critical_alert(service_name)
            return

        # The lowest priority number is the most preferred endpoint.
        backup_endpoint = min(available_endpoints, key=lambda ep: ep.priority)

        print(f"🔀 故障轉移到: {service_name}/{backup_endpoint.name}")
        print(f"   失敗端點: {failed_endpoint.name} (優先級 {failed_endpoint.priority})")
        print(f"   備用端點: {backup_endpoint.name} (優先級 {backup_endpoint.priority})")

        await self._log_failover_event(service_name, failed_endpoint, "failover", backup_endpoint)

        # Apply the switch (load balancer update is simulated).
        await self._update_load_balancer(service_name, failed_endpoint, backup_endpoint)

    async def _log_failover_event(self, service_name: str, endpoint: ServiceEndpoint,
                                  event_type: str, backup_endpoint: ServiceEndpoint = None):
        """Record a failover event in memory and persist the full history.

        The whole history is rewritten to ``failover_events.json`` inside
        ``self.failover_dir``; a write failure is reported but not raised.
        """
        import os  # local import: the cell's import block is not visible here

        event = {
            "timestamp": datetime.now(),
            "service_name": service_name,
            "event_type": event_type,
            "affected_endpoint": {
                "name": endpoint.name,
                "url": endpoint.url,
                "priority": endpoint.priority,
                "consecutive_failures": endpoint.consecutive_failures,
                "response_time_ms": endpoint.response_time_ms,
                "health_status": endpoint.health_status.value
            }
        }

        if backup_endpoint:
            event["backup_endpoint"] = {
                "name": backup_endpoint.name,
                "url": backup_endpoint.url,
                "priority": backup_endpoint.priority
            }

        self.failover_history.append(event)

        log_file = f"{self.failover_dir}/failover_events.json"
        try:
            # The directory may not exist yet on first use.
            os.makedirs(self.failover_dir, exist_ok=True)
            with open(log_file, 'w', encoding="utf-8") as f:
                # default=str serializes the datetime timestamps.
                json.dump(self.failover_history, f, indent=2, default=str)
        except Exception as e:
            print(f"⚠️  無法保存故障轉移日誌: {str(e)}")

    async def _update_load_balancer(self, service_name: str,
                                    failed_endpoint: ServiceEndpoint,
                                    backup_endpoint: ServiceEndpoint):
        """Simulate updating the load balancer configuration."""
        try:
            print("   📝 更新負載均衡器配置")
            # Simulated load balancer update.
            await asyncio.sleep(1)

            print("   ✅ 負載均衡器配置已更新")

        except Exception as e:
            print(f"   ❌ 負載均衡器更新失敗: {str(e)}")

    async def _send_alert(self, service_name: str, endpoint: ServiceEndpoint, rule: FailoverRule):
        """Print an alert for the endpoint that triggered ``rule``."""
        print(f"🚨 發送告警: {service_name}/{endpoint.name}")
        print(f"   規則: {rule.name}")
        print(f"   狀態: {endpoint.health_status.value}")

    async def _send_critical_alert(self, service_name: str):
        """Print a critical alert: no endpoint of the service is available."""
        print(f"🆘 嚴重告警: 服務 {service_name} 所有端點不可用")

    async def _remove_endpoint(self, service_name: str, endpoint: ServiceEndpoint):
        """Announce removal of an endpoint (the removal itself is not implemented)."""
        print(f"🗑️  移除端點: {service_name}/{endpoint.name}")
        # A real implementation would remove the endpoint from the load balancer here.

    def get_service_status(self, service_name: str) -> Dict:
        """Return a status summary for one service.

        Returns a dict with per-status endpoint counts, the name of the
        available primary endpoint (or None), the backup endpoints and a
        detail record per endpoint. For an unknown service the dict contains
        only an "error" entry.
        """
        if service_name not in self.services:
            return {"error": f"服務 {service_name} 不存在"}

        endpoints = self.services[service_name]

        def count(wanted: HealthStatus) -> int:
            return sum(1 for ep in endpoints if ep.health_status == wanted)

        status = {
            "service_name": service_name,
            "total_endpoints": len(endpoints),
            "healthy_endpoints": count(HealthStatus.HEALTHY),
            "warning_endpoints": count(HealthStatus.WARNING),
            "critical_endpoints": count(HealthStatus.CRITICAL),
            "unknown_endpoints": count(HealthStatus.UNKNOWN),
            "primary_endpoint": None,
            "backup_endpoints": [],
            "endpoints_detail": []
        }

        # Classify endpoints: priority 1 is the primary, higher numbers are backups.
        for endpoint in endpoints:
            if endpoint.priority == 1 and endpoint.is_available():
                status["primary_endpoint"] = endpoint.name
            elif endpoint.priority > 1:
                status["backup_endpoints"].append({
                    "name": endpoint.name,
                    "priority": endpoint.priority,
                    "available": endpoint.is_available()
                })

            status["endpoints_detail"].append({
                "name": endpoint.name,
                "url": endpoint.url,
                "priority": endpoint.priority,
                "health_status": endpoint.health_status.value,
                "response_time_ms": endpoint.response_time_ms,
                "success_rate": endpoint.success_rate,
                "consecutive_failures": endpoint.consecutive_failures,
                "last_check": endpoint.last_check.isoformat()
            })

        return status


# Create the automatic failover manager used by the following cells.
failover_manager = AutoFailoverManager()
print("🔄 自動故障轉移管理器已建立")
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

### 2.1 故障轉移架構設計

## 🎯 實驗 2：自動故障轉移系統

In [None]:
# 執行藍綠部署演示\nasync def demonstrate_blue_green_deployment():\n    \"\"\"演示藍綠部署流程\"\"\"\n    print(\"🚀 開始藍綠部署演示\")\n    print(\"=\" * 60)\n    \n    # 顯示部署前狀態\n    print(\"\\n📊 部署前狀態:\")\n    active_version = hot_update_manager.get_active_version(\"text_classifier\")\n    if active_version:\n        print(f\"   當前活躍版本: v{active_version.version}\")\n        print(f\"   版本狀態: {active_version.status}\")\n    \n    print(f\"\\n🎯 部署目標: 將 text_classifier 從 v2.0.0 升級到 v3.0.0\")\n    \n    # 執行藍綠部署\n    success = await blue_green_deployment.deploy_new_version(\n        model_name=\"text_classifier\",\n        target_version=\"3.0.0\"\n    )\n    \n    print(\"\\n\" + \"=\" * 60)\n    if success:\n        print(\"🎉 藍綠部署演示完成\")\n        \n        # 顯示部署後狀態\n        print(\"\\n📊 部署後狀態:\")\n        status = blue_green_deployment.get_deployment_status()\n        for key, value in status.items():\n            print(f\"   {key}: {value}\")\n        \n        # 更新模型版本狀態\n        versions = hot_update_manager.get_model_versions(\"text_classifier\")\n        versions[\"3.0.0\"].status = \"active\"\n        versions[\"2.0.0\"].status = \"deprecated\"\n        \n        print(f\"\\n✅ 模型版本狀態已更新\")\n        \n    else:\n        print(\"❌ 藍綠部署演示失敗\")\n\n\n# 執行演示\nawait demonstrate_blue_green_deployment()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

### 1.4 執行藍綠部署演示

In [None]:
# Blue-green deployment built on top of HotUpdateManager
class BlueGreenDeployment:
    """Simulated blue-green deployment driven by a ``HotUpdateManager``.

    Tracks which environment ("blue" or "green") is active and which model
    version each one holds. All infrastructure steps (model loading, routing,
    load balancer updates, cleanup) are simulated with sleeps and prints.
    """

    def __init__(self, hot_update_manager: HotUpdateManager):
        """Store the manager and start with "blue" as the active environment."""
        self.manager = hot_update_manager
        self.deployment_state = {
            "active_environment": "blue",  # "blue" or "green"
            "blue_version": None,
            "green_version": None,
            "switch_in_progress": False
        }

    async def deploy_new_version(self, model_name: str, target_version: str) -> bool:
        """Deploy ``target_version`` of ``model_name`` to the idle environment.

        Steps: readiness validation, backup, deploy to the idle environment,
        health check, traffic switch, cleanup of the old environment, then
        state and history update.

        Returns:
            True when the deployment completed, False when any step failed
            or raised.
        """
        try:
            print(f"🔵🟢 開始藍綠部署: {model_name} -> v{target_version}")

            # Validate deployment readiness.
            is_ready, message = self.manager.validate_deployment_readiness(model_name, target_version)
            if not is_ready:
                print(f"❌ 部署前檢查失敗: {message}")
                return False

            # Back up the current version.
            print("💾 步驟 1/6: 創建當前版本備份")
            if not self.manager.create_backup(model_name):
                print("❌ 備份失敗，終止部署")
                return False

            # Capture the source version up front. Reading it only after the
            # switch (and without a None guard) could raise once the
            # deployment was already done and turn a success into a
            # reported failure.
            active_before = self.manager.get_active_version(model_name)
            source_version = getattr(active_before, "version", None)

            # Decide which environment receives the new version.
            current_env = self.deployment_state["active_environment"]
            target_env = "green" if current_env == "blue" else "blue"

            print("🎯 步驟 2/6: 部署環境規劃")
            print(f"   當前環境: {current_env}")
            print(f"   目標環境: {target_env}")

            # Deploy the new version to the target environment.
            print(f"📦 步驟 3/6: 在 {target_env} 環境部署 v{target_version}")
            if not await self._deploy_to_environment(model_name, target_version, target_env):
                print(f"❌ 部署到 {target_env} 環境失敗")
                return False

            # Health check the target environment.
            print("🏥 步驟 4/6: 健康檢查")
            if not await self._health_check_environment(model_name, target_env):
                print(f"❌ {target_env} 環境健康檢查失敗")
                await self._cleanup_environment(target_env)
                return False

            # Switch traffic.
            print("🔄 步驟 5/6: 執行流量切換")
            if not await self._switch_traffic(current_env, target_env):
                print("❌ 流量切換失敗")
                await self._cleanup_environment(target_env)
                return False

            # Clean up the previous environment.
            print(f"🧹 步驟 6/6: 清理 {current_env} 環境")
            await self._cleanup_environment(current_env)

            # Update the deployment state.
            self.deployment_state["active_environment"] = target_env
            self.deployment_state[f"{target_env}_version"] = target_version
            self.deployment_state[f"{current_env}_version"] = None

            # Record the deployment in the manager's history.
            self.manager.deployment_history.append({
                "timestamp": datetime.now(),
                "action": "blue_green_deployment",
                "model_name": model_name,
                "source_version": source_version,
                "target_version": target_version,
                "source_environment": current_env,
                "target_environment": target_env,
                "status": "success"
            })

            print("✅ 藍綠部署成功完成")
            print(f"   🎯 當前活躍環境: {target_env}")
            print(f"   📦 當前活躍版本: v{target_version}")

            return True

        except Exception as e:
            print(f"❌ 藍綠部署失敗: {str(e)}")
            return False

    async def _deploy_to_environment(self, model_name: str, version: str, environment: str) -> bool:
        """Simulate deploying a model version to ``environment``."""
        try:
            print(f"   📋 準備 {environment} 環境配置")

            # Simulated configuration preparation.
            await asyncio.sleep(1)

            print(f"   📥 載入模型 v{version} 到 {environment} 環境")

            # Simulated model loading.
            await asyncio.sleep(2)

            print(f"   ⚙️  配置 {environment} 環境路由")

            # Simulated routing configuration.
            await asyncio.sleep(1)

            return True

        except Exception as e:
            print(f"❌ 部署到 {environment} 環境失敗: {str(e)}")
            return False

    async def _health_check_environment(self, model_name: str, environment: str) -> bool:
        """Simulate a health check; returns False on the first failed test.

        Each of the three simulated inference tests succeeds with 95 %
        probability, so the result is intentionally non-deterministic.
        """
        try:
            print(f"   🔍 檢查 {environment} 環境服務狀態")

            # Simulated service status check.
            await asyncio.sleep(1)

            print(f"   🚀 執行 {environment} 環境推理測試")

            # Simulated inference tests.
            for i in range(3):
                print(f"      測試 {i+1}/3: ", end="")
                await asyncio.sleep(0.5)

                # Simulated test result.
                latency = np.random.uniform(70, 90)
                success = np.random.random() > 0.05  # 95 % success rate

                if success:
                    print(f"✅ 成功 (延遲: {latency:.1f}ms)")
                else:
                    print("❌ 失敗")
                    return False

            print(f"   📊 {environment} 環境性能驗證通過")
            return True

        except Exception as e:
            print(f"❌ {environment} 環境健康檢查失敗: {str(e)}")
            return False

    async def _switch_traffic(self, source_env: str, target_env: str) -> bool:
        """Simulate switching traffic from ``source_env`` to ``target_env``.

        ``switch_in_progress`` is set for the duration of the switch and is
        always cleared again, including on errors and cancellation.
        """
        self.deployment_state["switch_in_progress"] = True
        try:
            print("   ⏸️  暫停新請求路由")
            await asyncio.sleep(0.5)

            print("   ⏳ 等待現有請求完成")
            await asyncio.sleep(1)

            print("   🔄 更新負載均衡器配置")
            print(f"      {source_env} -> {target_env}")
            await asyncio.sleep(1)

            print("   ▶️  恢復請求路由")
            await asyncio.sleep(0.5)

            print("   🔍 驗證流量切換")
            # Simulated traffic verification.
            for i in range(3):
                await asyncio.sleep(0.3)
                print(f"      驗證 {i+1}/3: ✅ 流量正常")

            return True

        except Exception as e:
            print(f"❌ 流量切換失敗: {str(e)}")
            return False

        finally:
            self.deployment_state["switch_in_progress"] = False

    async def _cleanup_environment(self, environment: str):
        """Simulate cleaning up ``environment``; problems are only reported."""
        try:
            print(f"   🧹 清理 {environment} 環境")
            print("      卸載舊模型")
            await asyncio.sleep(0.5)

            print("      清理暫存文件")
            await asyncio.sleep(0.5)

            print("      釋放資源")
            await asyncio.sleep(0.5)

            print(f"   ✅ {environment} 環境清理完成")

        except Exception as e:
            print(f"⚠️  清理 {environment} 環境時出現警告: {str(e)}")

    def get_deployment_status(self) -> Dict:
        """Return the deployment state plus the manager's history length."""
        return {
            "active_environment": self.deployment_state["active_environment"],
            "blue_version": self.deployment_state["blue_version"],
            "green_version": self.deployment_state["green_version"],
            "switch_in_progress": self.deployment_state["switch_in_progress"],
            "deployment_history_count": len(self.manager.deployment_history)
        }


# Create the blue-green deployment instance used by the following cells.
blue_green_deployment = BlueGreenDeployment(hot_update_manager)
print("🔵🟢 藍綠部署管理器已初始化")

# Show the current deployment status.
status = blue_green_deployment.get_deployment_status()
print("\n📊 當前部署狀態:")
for key, value in status.items():
    print(f"   {key}: {value}")
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

### 1.3 藍綠部署實現

In [None]:
def _version_sort_key(version: str):
    """Return a sort key that orders dotted version strings numerically.

    A plain string sort places "10.0.0" before "2.0.0". Each dot-separated
    part is mapped to a tuple so that numeric parts compare as integers and
    non-numeric parts (e.g. "rc1") compare as text after all numeric parts,
    without ever comparing an int against a str.
    """
    key = []
    for part in str(version).split("."):
        if part.isdigit():
            key.append((0, int(part), ""))
        else:
            key.append((1, 0, part))
    return tuple(key)


# Print a textual overview of every registered version of a model
def display_model_versions_overview(manager: HotUpdateManager, model_name: str):
    """Print an overview of all registered versions of ``model_name``.

    Versions are listed newest first (by ``created_at``), each with its
    status, metadata, health score and performance metrics, followed by a
    per-status count summary. Prints a notice and returns early when the
    manager has no versions for the model.

    Args:
        manager: Object providing ``get_model_versions(model_name)``.
        model_name: Name of the model to report on.
    """
    versions = manager.get_model_versions(model_name)

    if not versions:
        print(f"❌ 模型 '{model_name}' 沒有已註冊的版本")
        return

    print(f"\n📋 模型版本概覽: {model_name}")
    print("=" * 80)

    # Built once instead of on every loop iteration.
    status_icons = {
        "active": "🟢",
        "inactive": "🔵",
        "loading": "🟡",
        "deprecated": "🔴",
    }

    # Newest version first
    sorted_versions = sorted(versions.values(), key=lambda x: x.created_at, reverse=True)

    for version in sorted_versions:
        status_icon = status_icons.get(version.status, "⚪")

        print(f"\n{status_icon} 版本 {version.version} ({version.status.upper()})")
        print(f"   📅 創建時間: {version.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"   🏷️  框架: {version.metadata.get('framework', 'N/A')}")
        print(f"   ⚡ 精度: {version.metadata.get('precision', 'N/A')}")
        print(f"   📊 健康分數: {version.health_score:.1f}/100")

        if version.performance_metrics:
            print(f"   📈 性能指標:")
            for metric, value in version.performance_metrics.items():
                # The unit shown depends on the metric name.
                if 'latency' in metric:
                    print(f"      • {metric}: {value:.1f}ms")
                elif 'throughput' in metric:
                    print(f"      • {metric}: {value:.0f} QPS")
                else:
                    print(f"      • {metric}: {value}")

    # Summary statistics, counted in a single pass
    status_counts = {}
    for v in versions.values():
        status_counts[v.status] = status_counts.get(v.status, 0) + 1

    print(f"\n📊 版本統計:")
    print(f"   🟢 活躍: {status_counts.get('active', 0)}")
    print(f"   🔵 待命: {status_counts.get('inactive', 0)}")
    print(f"   🔴 已廢棄: {status_counts.get('deprecated', 0)}")
    print(f"   📦 總計: {len(versions)}")


# Visual comparison of version performance
def plot_version_performance_comparison(manager: HotUpdateManager, model_name: str):
    """Plot a 2x2 dashboard comparing the versions of ``model_name``.

    The panels show P99 latency, throughput and health score as bar charts
    (bars coloured by version status) and a pie chart of the status
    distribution. Versions are ordered numerically by their dotted version
    string. Prints a notice and returns early when there is no version data.

    Args:
        manager: Object providing ``get_model_versions(model_name)``.
        model_name: Name of the model to plot.
    """
    versions = manager.get_model_versions(model_name)

    if not versions:
        print(f"❌ 模型 '{model_name}' 沒有版本數據")
        return

    color_map = {
        "active": "#2ecc71",
        "inactive": "#3498db",
        "loading": "#f39c12",
        "deprecated": "#e74c3c",
    }

    # Numeric-aware ordering so that e.g. 10.0.0 is placed after 2.0.0.
    sorted_versions = sorted(versions.values(), key=lambda x: _version_sort_key(x.version))

    # Prepare the plotted series
    version_labels = [f"v{v.version}" for v in sorted_versions]
    latencies = [v.performance_metrics.get("latency_p99", 0) for v in sorted_versions]
    throughputs = [v.performance_metrics.get("throughput", 0) for v in sorted_versions]
    health_scores = [v.health_score for v in sorted_versions]
    status_colors = [color_map.get(v.status, "#95a5a6") for v in sorted_versions]

    def _label_bars(ax, bars, values, offset, fmt, skip_zero=True):
        """Write each bar's value just above it; optionally skip non-positive values."""
        for bar, val in zip(bars, values):
            if skip_zero and not val > 0:
                continue
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + offset,
                    format(val, fmt), ha='center', va='bottom', fontweight='bold')

    # Create the figure
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

    # P99 latency comparison
    bars1 = ax1.bar(version_labels, latencies, color=status_colors, alpha=0.8)
    ax1.set_title(f'{model_name} - P99 延遲比較', fontsize=14, fontweight='bold')
    ax1.set_ylabel('延遲 (ms)')
    ax1.grid(True, alpha=0.3)
    _label_bars(ax1, bars1, latencies, 1, '.1f')

    # Throughput comparison
    bars2 = ax2.bar(version_labels, throughputs, color=status_colors, alpha=0.8)
    ax2.set_title(f'{model_name} - 吞吐量比較', fontsize=14, fontweight='bold')
    ax2.set_ylabel('吞吐量 (QPS)')
    ax2.grid(True, alpha=0.3)
    _label_bars(ax2, bars2, throughputs, 20, '.0f')

    # Health score (always labelled, even when the score is 0)
    bars3 = ax3.bar(version_labels, health_scores, color=status_colors, alpha=0.8)
    ax3.set_title(f'{model_name} - 健康分數', fontsize=14, fontweight='bold')
    ax3.set_ylabel('健康分數')
    ax3.set_ylim(0, 105)
    ax3.grid(True, alpha=0.3)
    _label_bars(ax3, bars3, health_scores, 1, '.1f', skip_zero=False)

    # Version status distribution
    status_counts = {}
    for version in sorted_versions:
        status_counts[version.status] = status_counts.get(version.status, 0) + 1

    status_labels = list(status_counts.keys())
    status_values = list(status_counts.values())
    status_colors_pie = [color_map.get(status, "#95a5a6") for status in status_labels]

    ax4.pie(status_values, labels=status_labels,
            autopct='%1.0f%%', colors=status_colors_pie,
            startangle=90)
    ax4.set_title(f'{model_name} - 版本狀態分布', fontsize=14, fontweight='bold')

    # Legend: only statuses that actually occur, in color_map order
    legend_elements = [
        plt.Rectangle((0,0),1,1, facecolor=color, alpha=0.8, label=status.title())
        for status, color in color_map.items()
        if status in status_counts
    ]

    if legend_elements:
        fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, 0.02), ncol=len(legend_elements))

    plt.suptitle(f'模型版本性能分析: {model_name}', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.1)
    plt.show()


# Show the version overview
display_model_versions_overview(hot_update_manager, "text_classifier")

# Plot the performance comparison
plot_version_performance_comparison(hot_update_manager, "text_classifier")
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

### 1.2 版本管理可視化

In [None]:
class DeploymentStrategy(Enum):
    """Supported rollout strategies."""
    BLUE_GREEN = "blue_green"
    CANARY = "canary"
    ROLLING = "rolling"
    A_B_TEST = "ab_test"


@dataclass
class ModelVersion:
    """Record describing one version of a model."""
    name: str
    version: str
    created_at: datetime
    # One of "inactive", "loading", "active", "deprecated"
    status: str
    config_path: str
    model_path: str
    metadata: Dict[str, Any] = field(default_factory=dict)
    performance_metrics: Dict[str, float] = field(default_factory=dict)
    health_score: float = 100.0


@dataclass
class DeploymentConfig:
    """Parameters of a single deployment."""
    strategy: DeploymentStrategy
    source_version: str
    target_version: str
    # Percentage of traffic routed to the target version
    traffic_split: float
    rollback_threshold: Dict[str, float]
    health_check_interval: int
    # Seconds
    max_deployment_time: int
    auto_rollback: bool = True


class HotUpdateManager:
    """Registry of model versions plus readiness checks and backups."""

    def __init__(self, triton_url: str = TRITON_URL):
        """Create the Triton HTTP client and empty bookkeeping structures."""
        self.triton_url = triton_url
        self.client = httpclient.InferenceServerClient(url=triton_url)
        # model_name -> {version string -> ModelVersion}
        self.versions: Dict[str, Dict[str, ModelVersion]] = {}
        self.active_deployments: Dict[str, DeploymentConfig] = {}
        self.deployment_history: List[Dict] = []

        # Working directories under the enterprise root
        self.hot_update_dir = "/".join([ENTERPRISE_DIR, "hot_updates"])
        self.backup_dir = "/".join([ENTERPRISE_DIR, "backups"])

        print("🔥 熱更新管理器已初始化")
        print(f"   📊 Triton URL: {self.triton_url}")
        print(f"   📁 更新目錄: {self.hot_update_dir}")

    def register_model_version(self, model_name: str, version: ModelVersion) -> bool:
        """Store ``version`` under ``model_name``; return True on success, False on error."""
        try:
            per_model = self.versions.setdefault(model_name, {})
            per_model[version.version] = version

            created = version.created_at.strftime('%Y-%m-%d %H:%M:%S')
            print(f"✅ 已註冊模型版本: {model_name} v{version.version}")
            print(f"   📅 創建時間: {created}")
            print(f"   🏷️  狀態: {version.status}")
        except Exception as exc:
            print(f"❌ 註冊模型版本失敗: {exc!s}")
            return False
        return True

    def get_model_versions(self, model_name: str) -> Dict[str, ModelVersion]:
        """Return the version mapping for ``model_name`` (empty dict if unknown)."""
        return self.versions.get(model_name, {})

    def get_active_version(self, model_name: str) -> Optional[ModelVersion]:
        """Return the first version whose status is "active", or None."""
        candidates = self.get_model_versions(model_name).values()
        return next((v for v in candidates if v.status == "active"), None)

    def validate_deployment_readiness(self, model_name: str, target_version: str) -> Tuple[bool, str]:
        """Check whether ``target_version`` can be deployed.

        Returns a ``(ok, message)`` pair. Checks run in order: model known,
        version known, model file present, config file present, no other
        deployment in progress.
        """
        try:
            known_versions = self.versions.get(model_name)
            if known_versions is None:
                return False, f"模型 {model_name} 不存在"

            if target_version not in known_versions:
                return False, f"版本 {target_version} 不存在"

            candidate = known_versions[target_version]

            # Model file must exist on disk
            if not os.path.exists(candidate.model_path):
                return False, f"模型文件不存在: {candidate.model_path}"

            # Config file must exist on disk
            if not os.path.exists(candidate.config_path):
                return False, f"配置文件不存在: {candidate.config_path}"

            # Only one deployment at a time
            if self.active_deployments:
                return False, f"已有部署正在進行: {list(self.active_deployments)}"
        except Exception as exc:
            return False, f"驗證失敗: {exc!s}"
        return True, "部署準備就緒"

    def create_backup(self, model_name: str) -> bool:
        """Write a backup record for the active version of ``model_name``.

        Creates a timestamped directory under ``backup_dir`` containing
        ``backup_info.json``. Returns True when the backup is written or when
        there is no active version to back up; False on any error.
        """
        try:
            current = self.get_active_version(model_name)
            if not current:
                print(f"⚠️  模型 {model_name} 沒有活躍版本，跳過備份")
                return True

            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            label = f"{model_name}_v{current.version}_{stamp}"
            target_dir = "/".join([self.backup_dir, label])

            # Create the backup directory
            os.makedirs(target_dir, exist_ok=True)

            # Simplified: only the metadata record is written, not the model files
            print(f"💾 創建備份: {label}")

            record = {
                "model_name": model_name,
                "version": current.version,
                "backup_time": stamp,
                "backup_path": target_dir,
                "original_config": current.config_path,
                "original_model": current.model_path,
            }

            with open(target_dir + "/backup_info.json", 'w') as handle:
                json.dump(record, handle, indent=2)

            print(f"✅ 備份完成: {target_dir}")
            return True
        except Exception as exc:
            print(f"❌ 備份失敗: {exc!s}")
            return False


# Create the hot-update manager instance
hot_update_manager = HotUpdateManager()

# Simulated versions: (version, age, status, repo sub-directory, precision, p99 latency, throughput)
model_versions = [
    ModelVersion(
        name="text_classifier",
        version=ver,
        created_at=datetime.now() - age,
        status=state,
        config_path=f"/models/text_classifier/{slot}/config.pbtxt",
        model_path=f"/models/text_classifier/{slot}/model.plan",
        metadata={"framework": "tensorrt", "precision": precision},
        performance_metrics={"latency_p99": p99, "throughput": qps},
    )
    for ver, age, state, slot, precision, p99, qps in (
        ("1.0.0", timedelta(days=30), "deprecated", 1, "fp32", 120.5, 850.0),
        ("2.0.0", timedelta(days=7), "active", 2, "fp16", 95.2, 1200.0),
        ("3.0.0", timedelta(hours=2), "inactive", 3, "int8", 75.8, 1450.0),
    )
]

# Register every simulated version
for version in model_versions:
    hot_update_manager.register_model_version(version.name, version)

print(f"\n📊 已註冊 {len(model_versions)} 個模型版本")
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

### 1.1 熱更新系統設計

## 🎯 實驗 1：模型熱更新系統

# Lab 2.4.4 - 企業級高級特性

## 🎯 實驗目標

本實驗將教您如何：
1. 實現零停機時間的模型熱更新系統
2. 構建自動故障轉移和高可用性架構
3. 設計智能流量整形和負載均衡策略
4. 建立企業級監控和告警體系
5. 實現業務連續性保障機制

## 📋 前置需求

- 完成 Lab 2.1-2.3（Triton 基礎到性能優化）
- 熟悉容器編排和微服務架構
- 了解企業級運維和 DevOps 實踐

---

## 📚 理論背景

### 企業級部署挑戰

**1. 業務連續性要求**
- 99.9% 以上的服務可用性
- 零停機時間的系統更新
- 快速故障恢復和災難響應

**2. 風險控制需求**
- 多層次的健康檢查機制
- 自動化的故障轉移策略
- 全面的監控和告警體系

**3. 運維效率提升**
- 智能化的流量管理
- 自動化的運維流程
- 數據驅動的決策支持

### 企業級架構設計

```mermaid
graph TD
    A[Load Balancer] --> B[Primary Cluster]
    A --> C[Secondary Cluster]
    
    B --> D[Hot Update Manager]
    B --> E[Health Monitor]
    C --> F[Backup Services]
    
    D --> G[Blue Environment]
    D --> H[Green Environment]
    
    E --> I[Metrics Collector]
    E --> J[Alert Manager]
    
    I --> K[Monitoring Dashboard]
    J --> L[Incident Response]
```

## 🛠️ 環境準備

In [None]:
import os
import json
import time
import asyncio
import threading
import subprocess
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, field
from enum import Enum
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests

# Triton 客戶端
import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

# Optional enterprise support libraries: record availability instead of
# failing, so the rest of the notebook can branch on the flags.
try:
    import redis
    REDIS_AVAILABLE = True
    print("✅ Redis 客戶端可用 (狀態管理)")
except ImportError:
    REDIS_AVAILABLE = False
    print("⚠️  Redis 客戶端不可用")

try:
    import prometheus_client
    from prometheus_client import Counter, Histogram, Gauge
    PROMETHEUS_AVAILABLE = True
    print("✅ Prometheus 客戶端可用 (指標收集)")
except ImportError:
    PROMETHEUS_AVAILABLE = False
    print("⚠️  Prometheus 客戶端不可用")

# Plot styling. The seaborn style sheet is named "seaborn-v0_8" from
# matplotlib 3.6 on and "seaborn" before that; requesting a name the
# installed version does not know raises OSError, so use whichever exists
# and keep matplotlib's default style when neither does.
_available_styles = set(plt.style.available)
for _style_name in ("seaborn-v0_8", "seaborn"):
    if _style_name in _available_styles:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print(f"\n🔧 環境準備完成 - {datetime.now()}")
print(f"📁 工作目錄: {os.getcwd()}")

In [None]:
# Enterprise lab environment: root locations
BASE_DIR = "/opt/tritonserver"
MODEL_REPO = BASE_DIR + "/models"
ENTERPRISE_DIR = BASE_DIR + "/enterprise"

# Working directories, one per enterprise feature area
directories = [
    f"{ENTERPRISE_DIR}/{subdir}"
    for subdir in (
        "hot_updates",
        "failover",
        "monitoring",
        "configs",
        "logs",
        "backups",
    )
]

# Create everything first, then report what exists
for directory in directories:
    os.makedirs(directory, exist_ok=True)

print("📁 企業級目錄結構:")
print("\n".join(f"   {path}" for path in directories))

# Global settings
TRITON_URL = "localhost:8000"
MONITORING_INTERVAL = 10  # seconds
HEALTH_CHECK_TIMEOUT = 5  # seconds

print("\n⚙️  系統配置:")
print(f"   Triton URL: {TRITON_URL}")
print(f"   監控間隔: {MONITORING_INTERVAL}秒")
print(f"   健康檢查超時: {HEALTH_CHECK_TIMEOUT}秒")