# Lab-1.2: PyTorch DDP 分散式訓練基礎 - 01-Setup
## 環境設置與基礎概念

---

## ⚠️ 注意事項

本實驗室在**單GPU環境**中提供**概念演示**和**配置學習**。
- ✅ **可學習**: DDP 原理、配置方法、代碼結構
- ⚠️ **限制**: 無法展示真正的多GPU通訊和加速效果

---

## 📚 學習目標

1. 理解 PyTorch DDP 的基本概念
2. 掌握分散式訓練的環境配置
3. 學習進程初始化和通訊設置
4. 準備訓練數據和模型

## 1. 環境檢查

In [None]:
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
import os
import sys

# Report the PyTorch build and which distributed backends it was compiled with.
print(f"PyTorch 版本: {torch.__version__}")
capability_probes = (
    ("CUDA 可用", torch.cuda.is_available),
    ("GPU 數量", torch.cuda.device_count),
    ("分散式訓練支援", torch.distributed.is_available),
    ("NCCL 後端支援", torch.distributed.is_nccl_available),
    ("Gloo 後端支援", torch.distributed.is_gloo_available),
)
for probe_label, probe in capability_probes:
    print(f"{probe_label}: {probe()}")

# List every visible GPU together with its total memory in decimal gigabytes.
if torch.cuda.is_available():
    for gpu_index in range(torch.cuda.device_count()):
        gpu_props = torch.cuda.get_device_properties(gpu_index)
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")
        print(f"  記憶體: {gpu_props.total_memory / 1e9:.1f} GB")

## 2. DDP 核心概念

In [None]:
# Walkthrough of the key DDP concepts: a banner followed by three titled
# sections, each introduced by a blank line.
concept_sections = (
    ("1. 基本術語:", (
        "   - World Size: 總的進程數量 (通常等於GPU數量)",
        "   - Rank: 每個進程的唯一標識符 (0 到 world_size-1)",
        "   - Local Rank: 節點內的GPU編號",
        "   - Backend: 通訊後端 (NCCL for GPU, Gloo for CPU)",
    )),
    ("2. DDP 工作流程:", (
        "   Step 1: 初始化進程組 (init_process_group)",
        "   Step 2: 設置本地設備 (cuda.set_device)",
        "   Step 3: 包裝模型為DDP (DistributedDataParallel)",
        "   Step 4: 使用DistributedSampler分割數據",
        "   Step 5: 訓練 (自動梯度同步)",
        "   Step 6: 清理 (destroy_process_group)",
    )),
    ("3. 關鍵優勢:", (
        "   - 梯度自動同步 (All-Reduce)",
        "   - 參數一致性保證",
        "   - 高效的通訊優化",
        "   - 容錯和故障恢復",
    )),
)

print("=== PyTorch DDP 核心概念 ===")
for section_heading, section_items in concept_sections:
    print()
    print(section_heading)
    for section_item in section_items:
        print(section_item)

## 3. 進程初始化函數

In [None]:
def setup_ddp(rank, world_size, backend='nccl'):
    """
    Initialise the distributed process group for one worker process.

    MASTER_ADDR / MASTER_PORT only receive fallback values, so a rendezvous
    address already exported by a launcher (for example torchrun) is kept
    instead of being overwritten.

    Args:
        rank: Global index of the calling process.
        world_size: Total number of processes in the group.
        backend: Communication backend ('nccl' for GPU, 'gloo' for CPU).
    """
    print(f"[Rank {rank}] 初始化進程組...")

    # Fallback rendezvous address for single-node runs started without a launcher
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')

    dist.init_process_group(
        backend=backend,
        rank=rank,
        world_size=world_size
    )

    if torch.cuda.is_available() and backend == 'nccl':
        # The global rank is not a valid device index on multi-node jobs:
        # prefer the launcher's LOCAL_RANK, otherwise wrap the rank into the
        # range of GPUs visible on this node.
        local_rank = int(os.environ.get('LOCAL_RANK', rank % torch.cuda.device_count()))
        torch.cuda.set_device(local_rank)
        print(f"[Rank {rank}] 使用 GPU {local_rank}")
    else:
        print(f"[Rank {rank}] 使用 CPU")

    print(f"[Rank {rank}] 進程組初始化完成")

def cleanup_ddp():
    """
    Tear down the default process group if one is active.

    When no process group was initialised (as in the single-GPU walkthrough)
    the call returns silently instead of raising.
    """
    if not (dist.is_available() and dist.is_initialized()):
        return
    dist.destroy_process_group()
    print("進程組已清理")

# Simulated setup for a single-GPU machine
def setup_single_gpu_demo():
    """
    Print the DDP configuration that a single-GPU run would use.

    No worker process is started; the values are fixed placeholders.

    Returns:
        tuple: ``(world_size, rank)``, always ``(1, 0)``.
    """
    has_cuda = torch.cuda.is_available()
    # On a multi-GPU machine world_size would be the GPU count and every
    # process would receive its own rank.
    demo_world_size, demo_rank = 1, 0

    report_lines = (
        "=== 單GPU環境 DDP 概念演示 ===",
        "注意: 這是概念演示，不會啟動真正的多進程",
        "",
        "模擬配置:",
        f"  World Size: {demo_world_size}",
        f"  Rank: {demo_rank}",
        f"  Backend: {'nccl' if has_cuda else 'gloo'}",
        f"  Device: {'cuda:0' if has_cuda else 'cpu'}",
    )
    for report_line in report_lines:
        print(report_line)

    return demo_world_size, demo_rank

world_size, rank = setup_single_gpu_demo()

## 4. 示例模型定義

In [None]:
import torch.nn as nn

class SimpleTransformer(nn.Module):
    """
    Small Transformer-encoder language model used for the DDP walkthrough.

    Token embeddings and learned position embeddings are summed, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks and projected
    back onto the vocabulary.

    Args:
        vocab_size: Size of the token embedding table and of the output layer.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        max_seq_len: Number of rows in the position table, i.e. the longest
            sequence ``forward`` accepts.
    """
    def __init__(self, vocab_size=10000, d_model=512, nhead=8, num_layers=6, max_seq_len=256):
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len

        # Token embedding plus a learned position table
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Embedding(max_seq_len, d_model)

        # Encoder stack; batch_first=True keeps tensors as (batch, seq, feature)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        # Projection from hidden states to vocabulary logits
        self.output_proj = nn.Linear(d_model, vocab_size)

        # Re-initialise every Linear / Embedding submodule
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, attention_mask=None):
        """
        Compute vocabulary logits for a batch of token ids.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).
            attention_mask: Optional (batch, seq_len) tensor whose non-zero
                entries mark real tokens and whose zeros mark padding.

        Returns:
            Tensor of shape (batch, seq_len, vocab_size).

        Raises:
            ValueError: If seq_len exceeds ``max_seq_len``.
        """
        _, seq_len = input_ids.shape
        if seq_len > self.max_seq_len:
            # Fail early with a readable message rather than an out-of-range
            # lookup in the position embedding table.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len={self.max_seq_len}"
            )

        # The (seq_len, d_model) position embeddings broadcast over the batch
        positions = torch.arange(seq_len, device=input_ids.device)
        x = self.embedding(input_ids) + self.pos_encoding(positions)

        padding_mask = None
        if attention_mask is not None:
            # src_key_padding_mask uses the opposite convention: True = ignore
            padding_mask = ~attention_mask.bool()

        x = self.transformer(x, src_key_padding_mask=padding_mask)

        return self.output_proj(x)

# Build the demo model on the GPU when one is present, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = SimpleTransformer(
    vocab_size=8000,
    d_model=256,  # smaller width so the model fits on a single GPU
    nhead=8,
    num_layers=4,
    max_seq_len=128
)
model = model.to(device)

# Count all parameters and the subset that receives gradients
total_params = 0
trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"模型已創建，設備: {device}")
print(f"模型參數量: {total_params:,}")
print(f"可訓練參數量: {trainable_params:,}")

## 5. 數據集準備

In [None]:
import torch.utils.data as data
import numpy as np

class DummyTextDataset(data.Dataset):
    """
    Synthetic token dataset for the DDP training walkthrough.

    Args:
        num_samples: Number of sequences to generate.
        seq_len: Length of every sequence.
        vocab_size: Exclusive upper bound of the generated token ids
            (ids are drawn from ``[1, vocab_size)``).
        seed: Seed of the private random generator. The default of 42
            reproduces the data of the earlier ``np.random.seed(42)`` version.
    """
    def __init__(self, num_samples=1000, seq_len=128, vocab_size=8000, seed=42):
        self.num_samples = num_samples
        self.seq_len = seq_len
        self.vocab_size = vocab_size

        # A private generator keeps the data reproducible without reseeding
        # NumPy's global RNG behind the caller's back.
        rng = np.random.RandomState(seed)

        # Random token ids (real usage would load actual data here)
        self.data = rng.randint(1, vocab_size, (num_samples, seq_len))

        # Attention mask: ones for real tokens, zeros for the padded tail
        self.attention_masks = np.ones((num_samples, seq_len))
        for i in range(num_samples):
            # Each sequence keeps between seq_len // 2 and seq_len real tokens
            actual_len = rng.randint(seq_len // 2, seq_len + 1)
            self.attention_masks[i, actual_len:] = 0

    def __len__(self):
        """Return the number of generated sequences."""
        return self.num_samples

    def __getitem__(self, idx):
        """
        Return one sample as a dict of ``torch.long`` tensors.

        Keys: ``input_ids``, ``attention_mask`` and ``labels``, each of
        length ``seq_len``.
        """
        input_ids = torch.tensor(self.data[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.attention_masks[idx], dtype=torch.long)

        # Language-model targets: the inputs shifted left by one position,
        # with a trailing 0 filling the last slot
        labels = torch.cat([input_ids[1:], torch.tensor([0])], dim=0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

def create_dataloader(dataset, batch_size=8, shuffle=True, num_workers=2, is_distributed=False, rank=0, world_size=1):
    """
    Build a DataLoader, optionally sharded across processes for DDP.

    Args:
        dataset: Map-style dataset to load from.
        batch_size: Samples per batch (per process when distributed).
        shuffle: Whether samples are shuffled. In distributed mode this is
            forwarded to the ``DistributedSampler``.
        num_workers: Number of loader worker processes.
        is_distributed: Use a ``DistributedSampler`` when True.
        rank: Index of the calling process (distributed mode only).
        world_size: Total number of processes (distributed mode only).

    Returns:
        The configured ``DataLoader``. In distributed mode call
        ``dataloader.sampler.set_epoch(epoch)`` at the start of each epoch so
        that the shuffle order changes between epochs.
    """
    # Pinned host memory only speeds up host-to-GPU copies, so both branches
    # request it only when CUDA is present.
    pin_memory = torch.cuda.is_available()

    if is_distributed:
        # Each process sees its own shard of the dataset
        sampler = DistributedSampler(
            dataset,
            num_replicas=world_size,
            rank=rank,
            shuffle=shuffle
        )
        dataloader = data.DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=sampler,
            num_workers=num_workers,
            pin_memory=pin_memory
        )
        print(f"[Rank {rank}] 使用DistributedSampler")
    else:
        # Standard single-process loading
        dataloader = data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=pin_memory
        )
        print("使用標準DataLoader")

    return dataloader

# Instantiate the training and validation datasets
train_dataset = DummyTextDataset(num_samples=2000, seq_len=128)
val_dataset = DummyTextDataset(num_samples=400, seq_len=128)

train_size = len(train_dataset)
val_size = len(val_dataset)
print(f"訓練數據集大小: {train_size}")
print(f"驗證數據集大小: {val_size}")

# Inspect the first training sample: shape, dtype and leading values per field
sample = train_dataset[0]
print("\n數據格式檢查:")
for field_name in sample:
    field_tensor = sample[field_name]
    print(f"  {field_name}: shape={field_tensor.shape}, dtype={field_tensor.dtype}")
    print(f"    範例: {field_tensor[:10]}...")

## 6. DDP 模型包裝演示

In [None]:
def wrap_model_for_ddp(model, device_id=None, find_unused_parameters=False):
    """
    Wrap ``model`` in DistributedDataParallel when a multi-GPU job is running.

    The model is wrapped only if more than one CUDA device is visible and a
    process group has been initialised; otherwise it is returned unchanged.

    Args:
        model: Module to wrap.
        device_id: GPU index passed to DDP as ``device_ids=[device_id]``
            (``None`` leaves ``device_ids`` unset).
        find_unused_parameters: Forwarded to DDP.

    Returns:
        The DDP-wrapped module, or ``model`` itself when no wrapping happens.
    """
    multi_gpu_job = torch.cuda.device_count() > 1 and dist.is_initialized()
    if not multi_gpu_job:
        # Single-GPU (or uninitialised) environment: hand back the plain model
        print("單GPU環境，使用原始模型（非DDP）")
        return model

    ddp_device_ids = None if device_id is None else [device_id]
    wrapped = DDP(
        model,
        device_ids=ddp_device_ids,
        find_unused_parameters=find_unused_parameters
    )
    print(f"模型已包裝為DDP，使用GPU {device_id}")
    return wrapped

# Demonstrate the wrapping helper
print("=== DDP 模型包裝演示 ===")
print()

# In the single-GPU walkthrough this returns the model unchanged
demo_device_id = 0 if torch.cuda.is_available() else None
ddp_model = wrap_model_for_ddp(model, device_id=demo_device_id)

original_type_name = type(model).__name__
wrapped_type_name = type(ddp_model).__name__
print(f"\n原始模型類型: {original_type_name}")
print(f"DDP模型類型: {wrapped_type_name}")

# Summarise what DDP provides
print("\n=== DDP 關鍵特性 ===")
ddp_feature_lines = (
    "1. 參數同步: DDP會自動同步所有GPU上的模型參數",
    "2. 梯度聚合: 使用All-Reduce算法聚合梯度",
    "3. 廣播: 在訓練開始時廣播模型參數",
    "4. 錯誤檢測: 檢測未使用的參數和梯度異常",
)
for ddp_feature_line in ddp_feature_lines:
    print(ddp_feature_line)

# Only an object exposing a .module attribute gets the extra note
if hasattr(ddp_model, 'module'):
    print("\n注意: 在DDP模型中，原始模型可通過 .module 屬性訪問")
    print(f"ddp_model.module 類型: {type(ddp_model.module).__name__}")

## 7. 訓練配置

In [None]:
# Training configuration
config = {
    # Model
    'model_name': 'SimpleTransformer',
    'vocab_size': 8000,
    'd_model': 256,
    'nhead': 8,
    'num_layers': 4,
    'max_seq_len': 128,

    # Training
    'batch_size': 8,
    'learning_rate': 5e-4,
    'num_epochs': 3,
    'warmup_steps': 100,
    'weight_decay': 0.01,

    # DDP
    'backend': 'nccl' if torch.cuda.is_available() else 'gloo',
    'find_unused_parameters': False,
    'gradient_clipping': 1.0,

    # Data loading
    'num_workers': 2,
    'pin_memory': True,

    # Logging
    'log_interval': 50,
    'save_interval': 500,
    'eval_interval': 200,
}

# Keys listed under each heading of the printed summary, in display order
config_sections = {
    '模型配置': ['model_name', 'vocab_size', 'd_model', 'nhead', 'num_layers', 'max_seq_len'],
    '訓練配置': ['batch_size', 'learning_rate', 'num_epochs', 'warmup_steps', 'weight_decay'],
    'DDP配置': ['backend', 'find_unused_parameters', 'gradient_clipping'],
    '數據配置': ['num_workers', 'pin_memory'],
    '日誌配置': ['log_interval', 'save_interval', 'eval_interval'],
}

print("=== 訓練配置 ===")
for category, keys in config_sections.items():
    print(f"\n{category}:")
    for key in keys:
        if key in config:
            print(f"  {key}: {config[key]}")

# Optimizer over the (possibly DDP-wrapped) model parameters
optimizer = torch.optim.AdamW(
    ddp_model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay']
)

# Learning-rate schedule: linear warmup followed by cosine annealing
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR

warmup_steps = config['warmup_steps']
total_steps = config['num_epochs'] * len(train_dataset) // config['batch_size']

warmup_scheduler = LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)

# The cosine phase covers whatever remains after the warmup
cosine_scheduler = CosineAnnealingLR(optimizer, T_max=total_steps - warmup_steps)

scheduler = SequentialLR(
    optimizer,
    schedulers=[warmup_scheduler, cosine_scheduler],
    milestones=[warmup_steps]
)

print(f"\n優化器: {type(optimizer).__name__}")
print("調度器: Warmup + CosineAnnealing")
print(f"初始學習率: {config['learning_rate']}")

## 8. 多GPU訓練腳本範例

In [None]:
# Example multi-GPU training script, kept as a string so it can be printed.
# The script works with both launch styles printed below: under torchrun it
# reads RANK / LOCAL_RANK / WORLD_SIZE from the environment, and under plain
# `python` it spawns one process per GPU itself. YourModel and YourDataset are
# placeholders the reader must supply. The string starts directly with the
# shebang so that it is the first line when the text is saved to a file.
multi_gpu_script = '''#!/usr/bin/env python3
# train_ddp.py - 多GPU DDP訓練腳本

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import argparse

def setup(rank, world_size):
    """初始化分散式訓練環境，回傳本進程使用的GPU編號"""
    # torchrun 會自行設定 MASTER_ADDR / MASTER_PORT，這裡只提供預設值
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')

    # 初始化進程組
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    # 多節點時 rank 不等於節點內的GPU編號，優先使用 torchrun 提供的 LOCAL_RANK
    local_rank = int(os.environ.get('LOCAL_RANK', rank))
    torch.cuda.set_device(local_rank)
    return local_rank

def cleanup():
    """清理分散式訓練環境"""
    dist.destroy_process_group()

def train_ddp(rank, world_size, args):
    """DDP訓練主函數"""
    print(f"[Rank {rank}] 開始訓練")

    # 設置分散式環境
    local_rank = setup(rank, world_size)

    # 創建模型
    model = YourModel().to(local_rank)
    ddp_model = DDP(model, device_ids=[local_rank])

    # 創建優化器和損失函數
    optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()

    # 創建數據集和DataLoader
    dataset = YourDataset()
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, sampler=sampler)

    # 訓練循環
    for epoch in range(args.epochs):
        sampler.set_epoch(epoch)  # 確保每個epoch的數據分割不同

        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(local_rank), target.to(local_rank)

            optimizer.zero_grad()
            output = ddp_model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if rank == 0 and batch_idx % 100 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")

    cleanup()

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=1e-3)
    args = parser.parse_args()

    if 'LOCAL_RANK' in os.environ:
        # 由 torchrun 啟動：進程已建立，直接讀取 RANK / WORLD_SIZE
        train_ddp(int(os.environ['RANK']), int(os.environ['WORLD_SIZE']), args)
    else:
        # 直接以 python 執行：為每個GPU啟動一個進程
        world_size = torch.cuda.device_count()
        mp.spawn(train_ddp, args=(world_size, args), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()
'''

# Show the script
print("=== 多GPU DDP訓練腳本範例 ===")
print("以下是完整的多GPU DDP訓練腳本結構:")
print(multi_gpu_script)

# Show how to launch it
print("\n=== 執行命令 ===")
print("# 使用 torchrun (推薦)")
print("torchrun --nproc_per_node=4 train_ddp.py --epochs 10 --batch-size 32")
print()
print("# 使用 mp.spawn")
print("python train_ddp.py --epochs 10 --batch-size 32")
print()
print("# 多節點訓練")
print("torchrun --nnodes=2 --nproc_per_node=4 --node_rank=0 \\")
print("         --master_addr=192.168.1.1 --master_port=29500 \\")
print("         train_ddp.py")

## 9. 總結

In [None]:
# Closing summary; empty entries print as blank separator lines
summary_lines = (
    "=== Lab-1.2 Setup 完成 ===",
    "",
    "✅ 已完成:",
    "  1. 環境檢查和依賴驗證",
    "  2. DDP核心概念理解",
    "  3. 進程初始化函數定義",
    "  4. 示例模型創建",
    "  5. 數據集和DataLoader準備",
    "  6. DDP模型包裝演示",
    "  7. 訓練配置設定",
    "  8. 多GPU訓練腳本範例",
    "",
    "📝 下一步:",
    "  - 02-Train.ipynb: 實際訓練過程",
    "  - 03-Optimization.ipynb: 通訊優化",
    "  - 04-Advanced.ipynb: 進階技術",
    "",
    "💡 重要提醒:",
    "  - 當前為單GPU環境演示",
    "  - 多GPU環境中將看到真正的加速效果",
    "  - 重點理解DDP的設計理念和配置方法",
)
for summary_line in summary_lines:
    print(summary_line)

# Persist the configuration and model weights for the next notebook
setup_checkpoint = {
    'config': config,
    'model_state_dict': model.state_dict(),
    'vocab_size': 8000,
    'device': str(device)
}
torch.save(setup_checkpoint, 'ddp_setup.pth')

print("\n💾 配置已保存到 ddp_setup.pth")