In [2]:
# 1. 环境和依赖检查
import torch
import numpy as np
import os
import yaml

print("[INFO] Torch version:", torch.__version__)
print("[INFO] CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("[INFO] CUDA device:", torch.cuda.get_device_name(0))
else:
    print("[INFO] CUDA device not found, using CPU.")

# 安装 tqdm/scipy（如未安装时取消注释）
# !pip install tqdm scipy

[INFO] Torch version: 2.5.1+cu121
[INFO] CUDA available: True
[INFO] CUDA device: NVIDIA GeForce RTX 4080 Laptop GPU


In [3]:
# 2. 配置、模型和工具模块导入
from data.dataset import get_dataloaders
from models.mlp import MLPRegressor
from utils.metrics import mse_loss
import tqdm

In [4]:
# 3. 加载配置
cfg_path = "configs/mlp.yaml"
with open(cfg_path) as f:
    cfg = yaml.safe_load(f)
print("配置内容：", cfg)


配置内容： {'seed': 42, 'epochs': 100, 'batch_size': 64, 'lr': 0.001, 'hidden_dims': [150, 150, 150], 'activation': 'relu', 'train_ratio': 0.8, 'early_stop_patience': 10}


In [5]:
# 4. 设置随机种子与训练参数
def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
set_seed(cfg['seed'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)


Using device: cuda


In [6]:
# 5. 数据加载
scaler_path = "checkpoints/mlp_minmaxscaler.pkl"
train_loader, val_loader = get_dataloaders(
    mat_path='D:/AiProjects/UCLmaster/dlfitting_verdict/VERDICT_training/AS_Z_fixdv/TrainingSet.mat',
    batch_size=cfg['batch_size'],
    train_ratio=cfg['train_ratio'],
    seed=cfg['seed'],
    scaler_path=scaler_path
)
print("[INFO] 训练/验证数据已准备好。")


[INFO] 训练/验证数据已准备好。


In [7]:
# 6. 初始化模型和优化器
X_sample, y_sample = next(iter(train_loader))
input_dim = X_sample.shape[1]
output_dim = y_sample.shape[1]
model = MLPRegressor(input_dim, output_dim, cfg['hidden_dims'], cfg['activation']).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=cfg['lr'])
criterion = torch.nn.MSELoss()
print("[INFO] 模型初始化完成。")


[INFO] 模型初始化完成。


In [8]:
# 7. 日志与权重文件准备
log_dir = 'logs'
ckpt_dir = 'checkpoints'
os.makedirs(log_dir, exist_ok=True)
os.makedirs(ckpt_dir, exist_ok=True)
log_path = os.path.join(log_dir, 'mlp_log.txt')
ckpt_path = os.path.join(ckpt_dir, 'mlp_best.pt')

if not os.path.exists(log_path):
    with open(log_path, 'w') as f:
        f.write("Epoch\tTrainLoss\tValLoss\n")


In [9]:
# 8. 训练主循环（带tqdm进度条）
from tqdm.notebook import tqdm

epochs = cfg['epochs']
best_loss = float('inf')
patience = 0

for epoch in tqdm(range(epochs), desc="Epoch"):
    model.train()
    total_loss = 0
    for X, y in tqdm(train_loader, desc=f"Train {epoch+1}", leave=False):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * X.size(0)
    avg_train_loss = total_loss / len(train_loader.dataset)

    # 验证
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = criterion(pred, y)
            total_val_loss += loss.item() * X.size(0)
    avg_val_loss = total_val_loss / len(val_loader.dataset)

    # tqdm.write会在进度条下方输出log，方便notebook观察
    tqdm.write(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # 追加日志
    with open(log_path, 'a') as f:
        f.write(f"{epoch+1}\t{avg_train_loss:.6f}\t{avg_val_loss:.6f}\n")

    # Early stopping & best权重
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        patience = 0
        torch.save(model.state_dict(), ckpt_path)
        tqdm.write(f"[INFO] Best model saved at: {ckpt_path}")
    else:
        patience += 1
        if patience >= cfg['early_stop_patience']:
            tqdm.write("[INFO] Early stopping triggered.")
            break

print("训练完成！")


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html