In [1]:
import os
import yaml
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.amp import GradScaler, autocast
from tqdm import tqdm
from metrics.dataloader_trace import RainDataset, get_transform, compute_mean_std, save_trace_dict_to_csv

# Import modules
from models.archs.DPENet_v1 import DPENet
from models.archs.DPENet_v3 import DPENet_v3
from models.archs.Network_test import DPENet_Traceable
from models.archs.losses import SSIMLoss_v2, EdgeLoss_v2, L1Loss
from models.CosineAnnealingRestartCyclicLR import CosineAnnealingRestartCyclicLR

import numpy as np
import matplotlib.pyplot as plt
from torchvision.transforms.functional import to_pil_image

# Accepted spellings when a boolean option arrives as a string (e.g. a quoted
# YAML scalar). Compared after stripping whitespace and lower-casing.
_TRUE_STRINGS = frozenset({"true", "yes", "on", "1"})
_FALSE_STRINGS = frozenset({"false", "no", "off", "0", ""})


def _as_bool(value, key):
    """Convert a config value to bool without treating the string "false" as True.

    Args:
        value: Raw value read from the config mapping.
        key: Name of the config key, used only in the error message.

    Returns:
        bool: The parsed flag. Non-string values go through ``bool(value)``.

    Raises:
        ValueError: If ``value`` is a string that is not a recognised spelling.
    """
    if isinstance(value, str):
        normalized = value.strip().lower()
        if normalized in _TRUE_STRINGS:
            return True
        if normalized in _FALSE_STRINGS:
            return False
        raise ValueError(
            f"Config key '{key}' has an unrecognised boolean value: {value!r}"
        )
    return bool(value)


def load_config(file_path):
    """Load a YAML config file and coerce the known keys to concrete types.

    Numeric keys are cast explicitly, so values the YAML parser returns as
    strings still end up as ``int``/``float``. Keys not listed here are
    returned exactly as parsed.

    Args:
        file_path: Path to the YAML configuration file.

    Returns:
        dict: The parsed mapping with these keys normalised:
            ``epochs``, ``batch_size``, ``num_workers``, ``log_interval`` (int);
            ``lr``, ``eta_min``, ``grad_clip`` (float);
            ``periods`` (list of int); ``restart_weights`` (list of float);
            ``use_amp``, ``save_trace`` (bool).
        ``grad_clip`` defaults to 1.0 and ``save_trace`` to False when absent.

    Raises:
        ValueError: If the file does not contain a mapping, a numeric value
            cannot be converted, or a boolean string is not recognised.
        KeyError: If a required key is missing.
    """
    with open(file_path, 'r') as file:
        config = yaml.safe_load(file)

    # An empty file parses to None and a top-level list parses to a list;
    # fail here with a clear message instead of an opaque TypeError below.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file '{file_path}' must contain a top-level mapping, "
            f"got {type(config).__name__}"
        )

    # Ensure specific types
    config["epochs"] = int(config["epochs"])
    config["batch_size"] = int(config["batch_size"])
    config["lr"] = float(config["lr"])
    config["eta_min"] = float(config["eta_min"])
    config["periods"] = [int(period) for period in config["periods"]]
    config["restart_weights"] = [float(weight) for weight in config["restart_weights"]]
    config["num_workers"] = int(config["num_workers"])
    config["log_interval"] = int(config["log_interval"])
    config["use_amp"] = _as_bool(config["use_amp"], "use_amp")
    # Gradient-clipping threshold; optional, defaults to 1.0.
    config["grad_clip"] = float(config.get("grad_clip", 1.0))
    # Optional flag; the training cell also reads it with a False default.
    config["save_trace"] = _as_bool(config.get("save_trace", False), "save_trace")

    return config



In [2]:
if __name__ == "__main__":
    # Load configuration from YAML file
    config = load_config('config.yml')
    device = config["device"]

    # Create the directory that holds model checkpoints.
    # NOTE(review): nothing in this cell writes a checkpoint — confirm whether
    # saving happens elsewhere.
    os.makedirs(config["checkpoint_dir"], exist_ok=True)

    # Build the model and move it to the configured device.
    model = DPENet_Traceable()
    model.to(device)

    # Loss modules. l1_loss is constructed but not part of the objective below.
    ssim_loss = SSIMLoss_v2().to(device)
    edge_loss = EdgeLoss_v2().to(device)
    l1_loss = L1Loss(loss_weight=1.0, reduction='mean').to(device)

    # Optimizer and learning-rate scheduler (LR is multiplied by 0.2 at epochs 30/50/80).
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 50, 80], gamma=0.2)

    train_dataset = RainDataset(mode='train', dataset_name=config["dataset_name"], transform=get_transform(train=True))
    # Use the worker count from the config instead of a hard-coded value so the
    # `num_workers` setting actually takes effect.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config["batch_size"],
        shuffle=True,
        drop_last=True,
        num_workers=config["num_workers"],
        pin_memory=True,
    )

    # AMP (mixed precision) is currently disabled:
    # scaler = GradScaler('cuda', enabled=config["use_amp"])

    # Training loop
    print(f"開始訓練 DPENet_CFIM，使用設備：{config['device']}")

    for epoch in range(config["epochs"]):

        model.train()
        running_loss = 0.0
        running_ssim = 0.0
        counted_batches = 0  # batches that actually contributed an optimizer step
        pbar = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{config['epochs']}]")

        for batch_idx, (input_img, target_img) in enumerate(pbar):
            input_img, target_img = input_img.to(device), target_img.to(device)

            optimizer.zero_grad()

            # trace=True makes the model also return its intermediate features.
            mid_output, trace_features = model(input_img, trace=True)

            ssim_val = ssim_loss(mid_output, target_img)
            edge_val = edge_loss(mid_output, target_img)

            ssim_ok = bool(torch.isfinite(ssim_val).all())
            edge_ok = bool(torch.isfinite(edge_val).all())

            if not ssim_ok:
                print("⚠ SSIM Loss 出錯", ssim_val)

            if not edge_ok:
                print("⚠ Edge Loss 出錯", edge_val)

            # Combined objective; the trailing + 0.2 is a constant offset and
            # does not affect gradients.
            loss = 1 - ssim_val + 0.05 * edge_val + 0.2

            if ssim_ok and edge_ok:
                # Backward pass
                loss.backward()

                # Clip gradients to guard against exploding/oscillating updates.
                torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_clip"])
                optimizer.step()

                # Accumulate epoch statistics
                running_loss += loss.item()
                running_ssim += ssim_val.item()
                counted_batches += 1
            # else: a NaN/Inf loss would propagate NaN gradients into every
            # parameter, so this batch is skipped rather than stepped on.

            if batch_idx % config["log_interval"] == 0:
                pbar.set_postfix(loss=loss.item())

            # Optionally dump the traced intermediate features for this batch
            # (one directory per epoch/batch). Runs even for skipped batches so
            # a non-finite loss can be inspected.
            if config.get("save_trace", False):
                print("🧪 啟動 trace 資料儲存功能！")
                trace_dir = os.path.join(config["trace_output_dir"], f"epoch{epoch+1}_batch{batch_idx}")
                os.makedirs(trace_dir, exist_ok=True)
                save_trace_dict_to_csv(trace_features, prefix_dir=trace_dir)

        # Average over the batches that were actually used; with drop_last=True
        # a dataset smaller than one batch yields zero batches.
        if counted_batches:
            avg_loss = running_loss / counted_batches
            avg_ssim = running_ssim / counted_batches
        else:
            avg_loss = avg_ssim = float("nan")

        print(f"🔹 Epoch [{epoch+1}/{config['epochs']}], Loss: {avg_loss:.6f}, SSIM: {avg_ssim:.6f}")
        scheduler.step()  # advance the learning-rate schedule



開始訓練 DPENet_CFIM，使用設備：cuda


Epoch [1/1]: 100%|██████████| 1/1 [00:00<00:00,  5.12it/s, loss=0.679]

🧪 啟動 trace 資料儲存功能！
📁 正在儲存: input, shape: (1, 3, 16, 16), ndim: 4
📁 正在儲存: experts_layer0_output, shape: (1, 3, 16, 16), ndim: 4
📁 正在儲存: experts_layer1_output, shape: (1, 3, 16, 16), ndim: 4
📁 正在儲存: experts_layer2_output, shape: (1, 3, 16, 16), ndim: 4
📁 正在儲存: experts_layer0_weight, shape: (3, 3, 3, 3), ndim: 4
📁 正在儲存: experts_layer1_weight, shape: (3, 3, 3, 3), ndim: 4
📁 正在儲存: experts_layer2_weight, shape: (3, 3, 3, 3), ndim: 4
📁 正在儲存: conv1_layer0_output, shape: (1, 3, 16, 16), ndim: 4
📁 正在儲存: conv1_layer0_weight, shape: (3, 9, 1, 1), ndim: 4
📁 正在儲存: conv1_layer0_bias, shape: (3,), ndim: 1
📁 正在儲存: pa_layer0_output, shape: (1, 1, 16, 16), ndim: 4
📁 正在儲存: pa_layer0_weight, shape: (1, 3, 3, 3), ndim: 4
📁 正在儲存: pa_layer1_output, shape: (1, 1, 16, 16), ndim: 4
📁 正在儲存: pa_layer2_output, shape: (1, 3, 16, 16), ndim: 4
📁 正在儲存: pa_layer2_weight, shape: (3, 1, 3, 3), ndim: 4
📁 正在儲存: pa_layer2_bias, shape: (3,), ndim: 1
📁 正在儲存: pa_layer3_output, shape: (1, 3, 16, 16), ndim: 4
📁 正在儲存: output, shap


