In [1]:
import os
import yaml
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.amp import GradScaler, autocast
from tqdm import tqdm
from metrics.dataloader import RainDataset, get_transform, compute_mean_std

# Import modules
from models.archs.DPENet_v1 import DPENet
from models.archs.DPENet_v3 import DPENet_v3
from models.archs.losses import SSIMLoss_v2, EdgeLoss_v2, L1Loss
from models.CosineAnnealingRestartCyclicLR import CosineAnnealingRestartCyclicLR

In [2]:
def load_config(file_path):
    """Load the training configuration from a YAML file and normalise value types.

    Every field listed below is cast explicitly so downstream code receives
    the expected Python type regardless of how the YAML loader typed it.

    Args:
        file_path: Path to the YAML configuration file.

    Returns:
        dict: The parsed configuration, with these keys coerced:
            - ``epochs``, ``batch_size``, ``num_workers``, ``log_interval`` -> int
            - ``lr``, ``eta_min`` -> float
            - ``periods`` -> list of int
            - ``restart_weights`` -> list of float
            - ``use_amp`` -> bool
            - ``grad_clip`` -> float (defaults to 1.0 when absent or null)
        All other keys are returned as loaded.

    Raises:
        KeyError: If any coerced key other than ``grad_clip`` is missing.
        ValueError, TypeError: If a value cannot be converted to its target type.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as handle:
        config = yaml.safe_load(handle)

    for key in ("epochs", "batch_size", "num_workers", "log_interval"):
        config[key] = int(config[key])
    for key in ("lr", "eta_min"):
        config[key] = float(config[key])
    config["periods"] = [int(period) for period in config["periods"]]
    config["restart_weights"] = [float(weight) for weight in config["restart_weights"]]

    # bool("false") is True, so a quoted boolean in the YAML would silently
    # enable AMP. Map the usual "off" spellings to False; anything else keeps
    # plain bool() semantics.
    use_amp = config["use_amp"]
    if isinstance(use_amp, str):
        use_amp = use_amp.strip().lower() not in {"", "0", "false", "no", "off"}
    config["use_amp"] = bool(use_amp)

    # Gradient-clipping threshold; optional, falling back to 1.0.
    grad_clip = config.get("grad_clip")
    config["grad_clip"] = 1.0 if grad_clip is None else float(grad_clip)

    return config

def denormalize(tensor, mean, std, device='cpu'):
    """
    Undo a ``Normalize(mean, std)`` step by returning ``tensor * std + mean``.

    ``mean`` and ``std`` are converted to tensors, reshaped to ``(1, C, 1, 1)``
    and moved to ``device`` so that they broadcast against ``tensor``.
    """
    def _channel_stat(values):
        # One value per channel, laid out to broadcast over the other three axes.
        return torch.tensor(values).view(1, -1, 1, 1).to(device)

    shift = _channel_stat(mean)
    scale = _channel_stat(std)

    rescaled = tensor * scale
    return rescaled + shift

In [None]:
if __name__ == "__main__":
    # Training entry point: builds the model, losses, optimiser, scheduler and
    # data loader from config.yml, then runs the epoch loop and writes a
    # checkpoint every SAVE_EVERY_EPOCHS epochs.

    # Hyper-parameters that are fixed in this script rather than read from config.
    LR_MILESTONE_EPOCHS = [30, 50, 80]  # epochs at which the LR is multiplied by LR_GAMMA
    LR_GAMMA = 0.2
    EDGE_LOSS_WEIGHT = 0.05
    SAVE_EVERY_EPOCHS = 10

    # Load configuration from YAML file
    config = load_config('config.yml')
    device = torch.device(config["device"])

    # autocast and GradScaler below are hard-wired to the 'cuda' backend, so
    # mixed precision is only switched on when training on a CUDA device.
    use_amp = config["use_amp"] and device.type == "cuda"

    # Create the checkpoint directory
    os.makedirs(config["checkpoint_dir"], exist_ok=True)

    # Build the model
    model = DPENet()
    model.to(device)

    # Build the loss functions
    ssim_loss = SSIMLoss_v2().to(device)
    edge_loss = EdgeLoss_v2().to(device)
    # NOTE(review): instantiated but not part of the training objective below.
    l1_loss = L1Loss(loss_weight=1.0, reduction='mean').to(device)

    # Build the optimiser and learning-rate scheduler
    optimizer = optim.AdamW(model.parameters(), lr=config["lr"],
                            betas=(0.9, 0.999), weight_decay=0)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=LR_MILESTONE_EPOCHS, gamma=LR_GAMMA)
    # Alternative previously tried here: CosineAnnealingRestartCyclicLR built from
    # config["periods"], config["restart_weights"] and
    # eta_mins=[config["lr"], config["eta_min"]].

    # Precomputed per-channel dataset statistics (presumably produced by
    # compute_mean_std(mode='train', ...) -- TODO confirm). They are not applied
    # in this loop: get_transform() is called without them and the losses
    # operate directly on the model output and target.
    input_mean = [0.5110453963279724, 0.5104997158050537, 0.4877311885356903]
    input_std = [0.23112213611602783, 0.23167330026626587, 0.23953330516815186]

    target_mean = [0.43193507194519043, 0.43070125579833984, 0.4052175283432007]
    target_std = [0.24484442174434662, 0.2445715367794037, 0.25179967284202576]

    # Build the training dataset and loader
    train_dataset = RainDataset(mode='train', dataset_name=config["dataset_name"],
                                transform=get_transform(train=True))
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"],
                              shuffle=True, num_workers=config["num_workers"])

    # Set up AMP (mixed precision)
    scaler = GradScaler('cuda', enabled=use_amp)

    # Training loop
    print(f"開始訓練 DPENet_CFIM，使用設備：{config['device']}")

    for epoch in range(config["epochs"]):
        model.train()
        running_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{config['epochs']}]")

        for batch_idx, (input_img, target_img) in enumerate(pbar):
            input_img, target_img = input_img.to(device), target_img.to(device)

            optimizer.zero_grad()

            # Forward pass (under autocast when AMP is enabled)
            with autocast(device_type='cuda', enabled=use_amp):
                mid_output, output = model(input_img)

                # Both the intermediate and the final output are supervised
                # against the same target.
                ssim_val = ssim_loss(output, target_img) + ssim_loss(mid_output, target_img)
                edge_val = edge_loss(output, target_img) + edge_loss(mid_output, target_img)

                loss = ssim_val + EDGE_LOSS_WEIGHT * edge_val

            # Diagnostics only: report non-finite loss terms, training continues.
            if not torch.isfinite(ssim_val):
                print("⚠ SSIM Loss 出錯", ssim_val, input_img.sum(), target_img.sum())

            if not torch.isfinite(edge_val):
                print("⚠ Edge Loss 出錯", edge_val, input_img.sum(), target_img.sum())

            # Backward pass
            scaler.scale(loss).backward()

            # Gradients are still multiplied by the loss scale at this point;
            # unscale them first so the clipping threshold applies to the true
            # gradient norm. This is a no-op when the scaler is disabled.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_clip"])

            scaler.step(optimizer)
            scaler.update()

            # Record the loss (a single .item() call, i.e. one host sync per batch)
            batch_loss = loss.item()
            running_loss += batch_loss
            if batch_idx % config["log_interval"] == 0:
                pbar.set_postfix(loss=batch_loss)

        # MultiStepLR advances one milestone unit per step() call, so it is
        # stepped once per epoch; stepping per batch would pass all three
        # milestones within the first 80 iterations.
        scheduler.step()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_loss = running_loss / max(len(train_loader), 1)
        print(f"🔹 Epoch [{epoch+1}/{config['epochs']}], Loss: {avg_loss:.6f}")

        # Save the model weights every SAVE_EVERY_EPOCHS epochs
        if (epoch + 1) % SAVE_EVERY_EPOCHS == 0:
            save_path = os.path.join(config["checkpoint_dir"], f"DPENet_w_mid_epoch{epoch+1}.pth")
            torch.save(model.state_dict(), save_path)
            print(f"模型已保存：{save_path}")

開始訓練 DPENet_CFIM，使用設備：cuda


  with torch.cuda.amp.autocast():
Epoch [1/200]:  54%|█████▍    | 463/857 [04:11<03:34,  1.84it/s, loss=0.145] 


KeyboardInterrupt: 