In [1]:
import wandb
import torch
import time
import logging
# Configure the root logger once for the whole notebook: records at INFO and
# above go through a single StreamHandler (the console), and every line is
# prefixed with its timestamp and severity level.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


# Size of the simulated training loop. Kept as named constants so that the
# loop bounds and the final summary text cannot drift apart.
NUM_EPOCHS = 5
BATCHES_PER_EPOCH = 3

# Textual description of the model architecture; uploaded after training as a
# one-cell wandb.Table so it can be inspected in the run page.
model_info = """
SimpleModel(
  (fc1): Linear(in_features=10, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=1, bias=True)
  (activation): ReLU()
)
"""

# Initialise wandb. The run is used as a context manager so that it is always
# finished when the block exits -- including when the loop raises -- instead
# of relying on a trailing wandb.finish() that an exception would skip.
with wandb.init(
    project="debug_example",
    notes="调试训练流程",
    name="debug_run_" + time.strftime("%Y%m%d_%H%M%S"),
) as run:
    # Record a free-text setup marker in the run summary.
    run.summary['Setup'] = "模型和数据加载完成"

    t = 0  # global step counter across all epochs and batches

    # Simulated training loop.
    for epoch in range(NUM_EPOCHS):
        for batch_idx in range(BATCHES_PER_EPOCH):
            # Synthetic loss that decreases with both epoch and batch index.
            loss = 1.0 - 0.1 * epoch - 0.02 * batch_idx

            # Log numeric metrics together with a human-readable message.
            wandb.log({
                "loss": loss,
                "epoch": epoch+1,
                "batch": batch_idx+1,
                "debug_msg": f"Batch {batch_idx+1} 处理完成，当前loss: {loss:.4f}"
            })

            # Lazy %-style arguments: the message is only formatted when the
            # record is actually emitted. The rendered text is unchanged.
            logging.info(
                "t: %s, loss: %s, batch: %s, epoch: %s",
                t, loss, batch_idx + 1, epoch + 1,
            )
            t += 1

    # Store the closing debug note in the summary. The epoch count and final
    # loss are taken from the actual run rather than hard-coded, so the note
    # always matches the logged metrics.
    run.summary['Debug summary'] = (
        f"训练完成，共{NUM_EPOCHS}个epoch，最终loss约{loss:.2f}"
    )

    # Upload the model architecture as a single-row, single-column table.
    wandb.log({
        "model_details": wandb.Table(
            columns=["Model Architecture"],
            data=[[model_info]]
        )
    })


[34m[1mwandb[0m: Currently logged in as: [33m15652388600[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


2025-07-07 20:56:40,123 - INFO - t: 0, loss: 1.0, batch: 1, epoch: 1
2025-07-07 20:56:40,125 - INFO - t: 1, loss: 0.98, batch: 2, epoch: 1
2025-07-07 20:56:40,126 - INFO - t: 2, loss: 0.96, batch: 3, epoch: 1
2025-07-07 20:56:40,127 - INFO - t: 3, loss: 0.9, batch: 1, epoch: 2
2025-07-07 20:56:40,128 - INFO - t: 4, loss: 0.88, batch: 2, epoch: 2
2025-07-07 20:56:40,129 - INFO - t: 5, loss: 0.86, batch: 3, epoch: 2
2025-07-07 20:56:40,129 - INFO - t: 6, loss: 0.8, batch: 1, epoch: 3
2025-07-07 20:56:40,130 - INFO - t: 7, loss: 0.78, batch: 2, epoch: 3
2025-07-07 20:56:40,131 - INFO - t: 8, loss: 0.76, batch: 3, epoch: 3
2025-07-07 20:56:40,132 - INFO - t: 9, loss: 0.7, batch: 1, epoch: 4
2025-07-07 20:56:40,133 - INFO - t: 10, loss: 0.6799999999999999, batch: 2, epoch: 4
2025-07-07 20:56:40,134 - INFO - t: 11, loss: 0.6599999999999999, batch: 3, epoch: 4
2025-07-07 20:56:40,134 - INFO - t: 12, loss: 0.6, batch: 1, epoch: 5
2025-07-07 20:56:40,135 - INFO - t: 13, loss: 0.58, batch: 2, ep

0,1
batch,▁▅█▁▅█▁▅█▁▅█▁▅█
epoch,▁▁▁▃▃▃▅▅▅▆▆▆███
loss,██▇▆▆▆▅▄▄▃▃▃▂▁▁

0,1
Debug summary,训练完成，共5个epoch，最终loss...
Setup,模型和数据加载完成
batch,3
debug_msg,Batch 3 处理完成，当前loss:...
epoch,5
loss,0.56
