In [2]:
import os
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms, models
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
import numpy as np

- 분류(Classification) 기본 훈련 루프 + TensorBoard 로깅

In [3]:
import torchvision
import torch
print(torchvision.__version__)
print(torch.__version__)
print(torch.version.cuda)

0.12.0
1.11.0
11.3


In [None]:
import os
import time
import glob
import xml.etree.ElementTree as ET
from collections import Counter

from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# =====================
# CONFIG (edit here)
# =====================
DATA_DIR = r"D:\AI_SVT_Training_mk\annotations\annos"   # jpg/png + VOC xml가 함께 있는 폴더
LOG_BASE = r"D:\AI_SVT_Training_mk\train_result\pytorch_det"  # TensorBoard 로그 + 체크포인트 출력 기본 폴더
RUN_NAME = time.strftime("run_%Y%m%d_%H%M%S")
LOGDIR = os.path.join(LOG_BASE, RUN_NAME)
CKPT_DIR = os.path.join(LOGDIR, "checkpoints")

EPOCHS = 20
BATCH_SIZE = 4                 # GPU 메모리에 맞게 조절
LR = 1e-4
NUM_WORKERS = 4
VAL_SPLIT = 0.1                # 10% 검증
MIN_BOX_SIZE = 1               # 너무 작은 박스 제거(px)
SAVE_BEST_ONLY = True
SEED = 42

# =====================================
# Utility: set seed
# =====================================
def set_seed(seed: int = 42):
    import random
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

os.makedirs(LOGDIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

# =====================================
# 1) Scan XMLs and build class map
# =====================================
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}

def find_pairs(data_dir):
    xmls = sorted(glob.glob(os.path.join(data_dir, "*.xml")))
    pairs = []
    for xml in xmls:
        stem = os.path.splitext(os.path.basename(xml))[0]
        img_path = None
        for ext in IMG_EXTS:
            cand = os.path.join(data_dir, stem + ext)
            if os.path.exists(cand):
                img_path = cand
                break
        if img_path is not None:
            pairs.append((img_path, xml))
    return pairs

pairs = find_pairs(DATA_DIR)
if len(pairs) == 0:
    raise FileNotFoundError(f"No (image, xml) pairs in: {DATA_DIR}")

# gather class names
class_names = []
for _, xml in pairs:
    try:
        root = ET.parse(xml).getroot()
    except Exception:
        continue
    for obj in root.findall("object"):
        name = obj.findtext("name")
        if name:
            class_names.append(name.strip())

if not class_names:
    raise RuntimeError("No object classes found in XMLs.")

# unique + stable ordering
unique_classes = sorted(set(class_names))
# background=0 rule (torchvision detection)
class_to_idx = {c: i + 1 for i, c in enumerate(unique_classes)}
idx_to_class = {i + 1: c for i, c in enumerate(unique_classes)}

# save classes
with open(os.path.join(LOGDIR, "classes.txt"), "w", encoding="utf-8") as f:
    for c in unique_classes:
        f.write(f"{c}\n")

print("[INFO] Classes:", unique_classes)
print("[INFO] #Images:", len(pairs))

# =====================================
# 2) Dataset
# =====================================
class VOCDataset(Dataset):
    def __init__(self, pairs, transforms=None):
        self.pairs = pairs
        self.transforms = transforms

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_path, xml_path = self.pairs[idx]
        img = Image.open(img_path).convert("RGB")
        w, h = img.size

        boxes = []
        labels = []
        iscrowd = []

        root = ET.parse(xml_path).getroot()
        for obj in root.findall("object"):
            name = obj.findtext("name").strip()
            if name not in class_to_idx:
                # unseen class -> skip
                continue
            bnd = obj.find("bndbox")
            if bnd is None:
                continue
            try:
                xmin = float(bnd.findtext("xmin"))
                ymin = float(bnd.findtext("ymin"))
                xmax = float(bnd.findtext("xmax"))
                ymax = float(bnd.findtext("ymax"))
            except Exception:
                continue

            # clamp
            xmin = max(0, min(xmin, w - 1))
            ymin = max(0, min(ymin, h - 1))
            xmax = max(0, min(xmax, w - 1))
            ymax = max(0, min(ymax, h - 1))

            if xmax - xmin < MIN_BOX_SIZE or ymax - ymin < MIN_BOX_SIZE:
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_to_idx[name])

            diff = obj.findtext("difficult")
            iscrowd.append(1 if (diff and diff.strip() == "1") else 0)

        if len(boxes) == 0:
            # torchvision detection은 타겟 비어있으면 학습에 문제 -> 샘플 건너뛸 수 없으니 최소 더미는 허용 X.
            # 대신, 빈 샘플이면 작은 더미를 넣지 말고 Exception으로 처리해서 DataLoader에서 재시도하도록 함
            raise ValueError("Empty target (no boxes)")

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        iscrowd = torch.tensor(iscrowd, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "iscrowd": iscrowd,
            "image_id": torch.tensor([idx], dtype=torch.int64),
        }

        # to tensor [0,1]
        img_t = F.pil_to_tensor(img).float() / 255.0

        if self.transforms:
            img_t = self.transforms(img_t)

        return img_t, target


def collate_fn(batch):
    # skip samples that raised ValueError (Empty target)
    batch = [b for b in batch if b is not None]
    imgs, tgts = list(zip(*batch))
    return list(imgs), list(tgts)

# =====================================
# 3) Split train/val
# =====================================
from sklearn.model_selection import train_test_split
train_pairs, val_pairs = train_test_split(pairs, test_size=VAL_SPLIT, random_state=SEED, shuffle=True)

train_ds = VOCDataset(train_pairs)
val_ds   = VOCDataset(val_pairs)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)

# =====================================
# 4) Model / Optim
# =====================================
num_classes = 1 + len(unique_classes)  # background + K
model = fasterrcnn_resnet50_fpn(weights="DEFAULT", num_classes=num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(params, lr=LR)

# =====================================
# 5) TensorBoard writer
# =====================================
writer = SummaryWriter(LOGDIR)
with open(os.path.join(LOGDIR, "readme.txt"), "w", encoding="utf-8") as f:
    f.write(f"DATA_DIR={DATA_DIR}\n")
    f.write(f"classes={unique_classes}\n")

# =====================================
# 6) Train/Eval loops
# =====================================

def evaluate_loss(model, loader, max_iters=None):
    model.eval()
    total = 0.0
    count = 0
    with torch.no_grad():
        pbar = tqdm(loader, desc="Val", leave=False)
        for i, (images, targets) in enumerate(pbar):
            images = [im.to(device) for im in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            loss = sum(v for v in loss_dict.values())
            total += loss.item()
            count += 1
            if max_iters and i + 1 >= max_iters:
                break
    return total / max(1, count)

best_val = float("inf")
step = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Train {epoch}/{EPOCHS}")
    for images, targets in pbar:
        images = [im.to(device) for im in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        loss = sum(v for v in loss_dict.values())

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # TB per-step
        writer.add_scalar("train/total_loss", loss.item(), step)
        for k, v in loss_dict.items():
            writer.add_scalar(f"train/{k}", v.item(), step)
        step += 1

        pbar.set_postfix(total_loss=f"{loss.item():.4f}")

    # Validation
    val_loss = evaluate_loss(model, val_loader)
    writer.add_scalar("val/total_loss", val_loss, epoch)

    # Save
    ckpt_path = os.path.join(CKPT_DIR, f"epoch{epoch:03d}_valloss{val_loss:.4f}.pt")
    torch.save({
        "epoch": epoch,
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "classes": unique_classes,
        "val_loss": val_loss,
    }, ckpt_path)

    if SAVE_BEST_ONLY:
        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), os.path.join(CKPT_DIR, "best.pt"))
    else:
        torch.save(model.state_dict(), os.path.join(CKPT_DIR, "last.pt"))

# 훈련 종료 마커 (GUI가 감지하도록 텍스트 로그 남김)
writer.add_text("status/final", f"done@{time.strftime('%Y-%m-%d %H:%M:%S')}")
writer.close()

print(f"[DONE] Logs: {LOGDIR}")
print(f"[TIP] tensorboard --logdir={LOG_BASE} --port=6007")


[INFO] Classes: ['1']
[INFO] #Images: 216


TypeError: FasterRCNN.__init__() got an unexpected keyword argument 'weights'

In [12]:
import os
import time
import glob
import xml.etree.ElementTree as ET
from typing import List, Tuple

from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# =====================
# CONFIG (edit here)
# =====================
DATA_DIR = r"D:\AI_SVT_Training_mk\annotations\annos"   # jpg/png + VOC xml가 함께 있는 폴더
LOG_BASE = r"D:\AI_SVT_Training_mk\train_result\pytorch_det"  # TensorBoard 로그 + 체크포인트 출력 기본 폴더
RUN_NAME = time.strftime("run_%Y%m%d_%H%M%S")
LOGDIR = os.path.join(LOG_BASE, RUN_NAME)
CKPT_DIR = os.path.join(LOGDIR, "checkpoints")

EPOCHS = 20
BATCH_SIZE = 2                 # GPU 메모리에 맞게 조절
LR = 1e-4
NUM_WORKERS = 4
VAL_SPLIT = 0.1                # 10% 검증
MIN_BOX_SIZE = 1               # 너무 작은 박스 제거(px)
SAVE_BEST_ONLY = True
SEED = 42

# =====================================
# Utility: set seed
# =====================================
def set_seed(seed: int = 42):
    import random
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

os.makedirs(LOGDIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}

def find_pairs(data_dir: str) -> List[Tuple[str, str]]:
    xmls = sorted(glob.glob(os.path.join(data_dir, "*.xml")))
    pairs = []
    for xml in xmls:
        stem = os.path.splitext(os.path.basename(xml))[0]
        img_path = None
        for ext in IMG_EXTS:
            cand = os.path.join(data_dir, stem + ext)
            if os.path.exists(cand):
                img_path = cand
                break
        if img_path is not None:
            pairs.append((img_path, xml))
    return pairs


def xml_has_valid_object(xml_path: str) -> bool:
    try:
        root = ET.parse(xml_path).getroot()
    except Exception:
        return False
    for obj in root.findall("object"):
        bnd = obj.find("bndbox")
        if bnd is None:
            continue
        try:
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))
        except Exception:
            continue
        if (xmax - xmin) >= MIN_BOX_SIZE and (ymax - ymin) >= MIN_BOX_SIZE:
            return True
    return False

pairs_all = find_pairs(DATA_DIR)
# 학습 중 빈 타겟 샘플로 인한 크래시를 방지하기 위해, 미리 필터링
pairs = [(img, xml) for (img, xml) in pairs_all if xml_has_valid_object(xml)]
if len(pairs) == 0:
    raise FileNotFoundError(f"No valid (image, xml) pairs with at least one box in: {DATA_DIR}")

# gather class names from filtered pairs만 사용
class_names = []
for _, xml in pairs:
    root = ET.parse(xml).getroot()
    for obj in root.findall("object"):
        name = obj.findtext("name")
        if name:
            class_names.append(name.strip())

unique_classes = sorted(set(class_names))
if not unique_classes:
    raise RuntimeError("No object classes found in XMLs after filtering.")

# background=0 rule (torchvision detection)
class_to_idx = {c: i + 1 for i, c in enumerate(unique_classes)}
idx_to_class = {i + 1: c for i, c in enumerate(unique_classes)}

with open(os.path.join(LOGDIR, "classes.txt"), "w", encoding="utf-8") as f:
    for c in unique_classes:
        f.write(f"{c}\n")

print("[INFO] Classes:", unique_classes)
print("[INFO] #Images(valid):", len(pairs), f"/ total {len(pairs_all)}")

# =====================================
# Dataset
# =====================================
class VOCDataset(Dataset):
    def __init__(self, pairs, transforms=None):
        self.pairs = pairs
        self.transforms = transforms

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_path, xml_path = self.pairs[idx]
        img = Image.open(img_path).convert("RGB")
        w, h = img.size

        boxes = []
        labels = []
        iscrowd = []

        root = ET.parse(xml_path).getroot()
        for obj in root.findall("object"):
            name = obj.findtext("name").strip()
            if name not in class_to_idx:
                continue
            bnd = obj.find("bndbox")
            if bnd is None:
                continue
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))

            # clamp
            xmin = max(0, min(xmin, w - 1))
            ymin = max(0, min(ymin, h - 1))
            xmax = max(0, min(xmax, w - 1))
            ymax = max(0, min(ymax, h - 1))

            if xmax - xmin < MIN_BOX_SIZE or ymax - ymin < MIN_BOX_SIZE:
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_to_idx[name])

            diff = obj.findtext("difficult")
            iscrowd.append(1 if (diff and diff.strip() == "1") else 0)

        # 0개 박스 샘플은 미리 필터링했으므로 여기선 반드시 >=1
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        iscrowd = torch.tensor(iscrowd, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "iscrowd": iscrowd,
            "image_id": torch.tensor([idx], dtype=torch.int64),
        }

        img_t = F.pil_to_tensor(img).float() / 255.0
        if self.transforms:
            img_t = self.transforms(img_t)
        return img_t, target


def collate_fn(batch):
    imgs, tgts = list(zip(*batch))
    return list(imgs), list(tgts)

# =====================================
# Split train/val
# =====================================
from sklearn.model_selection import train_test_split
train_pairs, val_pairs = train_test_split(pairs, test_size=VAL_SPLIT, random_state=SEED, shuffle=True)

train_ds = VOCDataset(train_pairs)
val_ds   = VOCDataset(val_pairs)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)

# =====================================
# Model / Optim (LEGACY API for torchvision 0.12)
# =====================================
num_classes = 1 + len(unique_classes)  # background + K

# 구버전: weights 인자 없음. COCO 사전학습 가중치 로드 → head 교체
model = fasterrcnn_resnet50_fpn(pretrained=True)  # backbone+RPN+head가 COCO용
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)  # 새 head로 교체

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(params, lr=LR)

# =====================================
# TensorBoard writer
# =====================================
writer = SummaryWriter(LOGDIR)
with open(os.path.join(LOGDIR, "readme.txt"), "w", encoding="utf-8") as f:
    f.write(f"DATA_DIR={DATA_DIR}\n")
    f.write(f"classes={unique_classes}\n")
    f.write("legacy api: torchvision 0.12 / torch 1.11\n")

# =====================================
# Train/Eval loops
# =====================================

def evaluate_loss(model, loader, max_iters=None):
    model.eval()
    total = 0.0
    count = 0
    with torch.no_grad():
        pbar = tqdm(loader, desc="Val", leave=False)
        for i, (images, targets) in enumerate(pbar):
            images = [im.to(device) for im in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            loss = sum(v for v in loss_dict.values())
            total += loss.item()
            count += 1
            if max_iters and i + 1 >= max_iters:
                break
    return total / max(1, count)

best_val = float("inf")
step = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Train {epoch}/{EPOCHS}")
    for images, targets in pbar:
        images = [im.to(device) for im in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        loss = sum(v for v in loss_dict.values())

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # TB per-step
        writer.add_scalar("train/total_loss", loss.item(), step)
        for k, v in loss_dict.items():
            writer.add_scalar(f"train/{k}", v.item(), step)
        step += 1

        pbar.set_postfix(total_loss=f"{loss.item():.4f}")

    # Validation
    val_loss = evaluate_loss(model, val_loader)
    writer.add_scalar("val/total_loss", val_loss, epoch)

    # Save
    ckpt_path = os.path.join(CKPT_DIR, f"epoch{epoch:03d}_valloss{val_loss:.4f}.pt")
    torch.save({
        "epoch": epoch,
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "classes": unique_classes,
        "val_loss": val_loss,
    }, ckpt_path)

    if SAVE_BEST_ONLY:
        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), os.path.join(CKPT_DIR, "best.pt"))
    else:
        torch.save(model.state_dict(), os.path.join(CKPT_DIR, "last.pt"))

# 훈련 종료 마커 (GUI가 감지하도록 텍스트 로그 남김)
writer.add_text("status/final", f"done@{time.strftime('%Y-%m-%d %H:%M:%S')}")
writer.close()

print(f"[DONE] Logs: {LOGDIR}")
print(f"[TIP] tensorboard --logdir={LOG_BASE} --port=6007")



[INFO] Classes: ['1']
[INFO] #Images(valid): 216 / total 216


Train 1/20:   0%|          | 0/97 [00:05<?, ?it/s]


RuntimeError: DataLoader worker (pid(s) 13168, 19424, 17148, 13540) exited unexpectedly

In [14]:
import os
import time
import glob
import xml.etree.ElementTree as ET
from typing import List, Tuple

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# =====================
# CONFIG (edit here)
# =====================
DATA_DIR = r"D:\AI_SVT_Training_mk\annotations\annos"   # jpg/png + VOC xml가 함께 있는 폴더
LOG_BASE = r"D:\AI_SVT_Training_mk\train_result\pytorch_det"  # TensorBoard 로그 + 체크포인트 출력 기본 폴더
RUN_NAME = time.strftime("run_%Y%m%d_%H%M%S")
LOGDIR = os.path.join(LOG_BASE, RUN_NAME)
CKPT_DIR = os.path.join(LOGDIR, "checkpoints")

EPOCHS = 20
BATCH_SIZE = 4                 # GPU 메모리에 맞게 조절
LR = 1e-4
NUM_WORKERS = 0  # Jupyter/Windows에서 DataLoader 다중 프로세스 충돌 방지. 스크립트 실행 시 4~8로 올리세요.
VAL_SPLIT = 0.1                # 10% 검증
MIN_BOX_SIZE = 1               # 너무 작은 박스 제거(px)
SAVE_BEST_ONLY = True
SEED = 42

# =====================================
# Utility: set seed
# =====================================
def set_seed(seed: int = 42):
    import random
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

os.makedirs(LOGDIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}

def find_pairs(data_dir: str) -> List[Tuple[str, str]]:
    xmls = sorted(glob.glob(os.path.join(data_dir, "*.xml")))
    pairs = []
    for xml in xmls:
        stem = os.path.splitext(os.path.basename(xml))[0]
        img_path = None
        for ext in IMG_EXTS:
            cand = os.path.join(data_dir, stem + ext)
            if os.path.exists(cand):
                img_path = cand
                break
        if img_path is not None:
            pairs.append((img_path, xml))
    return pairs


def xml_has_valid_object(xml_path: str) -> bool:
    try:
        root = ET.parse(xml_path).getroot()
    except Exception:
        return False
    for obj in root.findall("object"):
        bnd = obj.find("bndbox")
        if bnd is None:
            continue
        try:
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))
        except Exception:
            continue
        if (xmax - xmin) >= MIN_BOX_SIZE and (ymax - ymin) >= MIN_BOX_SIZE:
            return True
    return False

pairs_all = find_pairs(DATA_DIR)
# 학습 중 빈 타겟 샘플로 인한 크래시를 방지하기 위해, 미리 필터링
pairs = [(img, xml) for (img, xml) in pairs_all if xml_has_valid_object(xml)]
if len(pairs) == 0:
    raise FileNotFoundError(f"No valid (image, xml) pairs with at least one box in: {DATA_DIR}")

# gather class names from filtered pairs만 사용
class_names = []
for _, xml in pairs:
    root = ET.parse(xml).getroot()
    for obj in root.findall("object"):
        name = obj.findtext("name")
        if name:
            class_names.append(name.strip())

unique_classes = sorted(set(class_names))
if not unique_classes:
    raise RuntimeError("No object classes found in XMLs after filtering.")

# background=0 rule (torchvision detection)
class_to_idx = {c: i + 1 for i, c in enumerate(unique_classes)}
idx_to_class = {i + 1: c for i, c in enumerate(unique_classes)}

with open(os.path.join(LOGDIR, "classes.txt"), "w", encoding="utf-8") as f:
    for c in unique_classes:
        f.write(f"{c}\n")

print("[INFO] Classes:", unique_classes)
print("[INFO] #Images(valid):", len(pairs), f"/ total {len(pairs_all)}")

# =====================================
# Dataset
# =====================================
class VOCDataset(Dataset):
    def __init__(self, pairs, transforms=None):
        self.pairs = pairs
        self.transforms = transforms

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_path, xml_path = self.pairs[idx]
        img = Image.open(img_path).convert("RGB")
        w, h = img.size

        boxes = []
        labels = []
        iscrowd = []

        root = ET.parse(xml_path).getroot()
        for obj in root.findall("object"):
            name = obj.findtext("name").strip()
            if name not in class_to_idx:
                continue
            bnd = obj.find("bndbox")
            if bnd is None:
                continue
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))

            # clamp
            xmin = max(0, min(xmin, w - 1))
            ymin = max(0, min(ymin, h - 1))
            xmax = max(0, min(xmax, w - 1))
            ymax = max(0, min(ymax, h - 1))

            if xmax - xmin < MIN_BOX_SIZE or ymax - ymin < MIN_BOX_SIZE:
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_to_idx[name])

            diff = obj.findtext("difficult")
            iscrowd.append(1 if (diff and diff.strip() == "1") else 0)

        # 0개 박스 샘플은 미리 필터링했으므로 여기선 반드시 >=1
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        iscrowd = torch.tensor(iscrowd, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "iscrowd": iscrowd,
            "image_id": torch.tensor([idx], dtype=torch.int64),
        }

        img_t = F.pil_to_tensor(img).float() / 255.0
        if self.transforms:
            img_t = self.transforms(img_t)
        return img_t, target


def collate_fn(batch):
    imgs, tgts = list(zip(*batch))
    return list(imgs), list(tgts)

# =====================================
# Split train/val
# =====================================
from sklearn.model_selection import train_test_split
train_pairs, val_pairs = train_test_split(pairs, test_size=VAL_SPLIT, random_state=SEED, shuffle=True)

train_ds = VOCDataset(train_pairs)
val_ds   = VOCDataset(val_pairs)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)

# =====================================
# Model / Optim (LEGACY API for torchvision 0.12)
# =====================================
num_classes = 1 + len(unique_classes)  # background + K

# 구버전: weights 인자 없음. COCO 사전학습 가중치 로드 → head 교체
model = fasterrcnn_resnet50_fpn(pretrained=True)  # backbone+RPN+head가 COCO용
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)  # 새 head로 교체

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(params, lr=LR)

# =====================================
# TensorBoard writer
# =====================================
writer = SummaryWriter(LOGDIR)
with open(os.path.join(LOGDIR, "readme.txt"), "w", encoding="utf-8") as f:
    f.write(f"DATA_DIR={DATA_DIR}\n")
    f.write(f"classes={unique_classes}\n")
    f.write("legacy api: torchvision 0.12 / torch 1.11\n")

# =====================================
# Train/Eval loops
# =====================================

def evaluate_loss(model, loader, max_iters=None):
    model.eval()
    total = 0.0
    count = 0
    with torch.no_grad():
        pbar = tqdm(loader, desc="Val", leave=False)
        for i, (images, targets) in enumerate(pbar):
            images = [im.to(device) for im in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            loss = sum(v for v in loss_dict.values())
            total += loss.item()
            count += 1
            if max_iters and i + 1 >= max_iters:
                break
    return total / max(1, count)

best_val = float("inf")
step = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Train {epoch}/{EPOCHS}")
    for images, targets in pbar:
        images = [im.to(device) for im in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        loss = sum(v for v in loss_dict.values())

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # TB per-step
        writer.add_scalar("train/total_loss", loss.item(), step)
        for k, v in loss_dict.items():
            writer.add_scalar(f"train/{k}", v.item(), step)
        step += 1

        pbar.set_postfix(total_loss=f"{loss.item():.4f}")

    # Validation
    val_loss = evaluate_loss(model, val_loader)
    writer.add_scalar("val/total_loss", val_loss, epoch)

    # Save
    ckpt_path = os.path.join(CKPT_DIR, f"epoch{epoch:03d}_valloss{val_loss:.4f}.pt")
    torch.save({
        "epoch": epoch,
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "classes": unique_classes,
        "val_loss": val_loss,
    }, ckpt_path)

    if SAVE_BEST_ONLY:
        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), os.path.join(CKPT_DIR, "best.pt"))
    else:
        torch.save(model.state_dict(), os.path.join(CKPT_DIR, "last.pt"))

# 훈련 종료 마커 (GUI가 감지하도록 텍스트 로그 남김)
writer.add_text("status/final", f"done@{time.strftime('%Y-%m-%d %H:%M:%S')}")
writer.close()

print(f"[DONE] Logs: {LOGDIR}")
print(f"[TIP] tensorboard --logdir={LOG_BASE} --port=6007")


[INFO] Classes: ['1']
[INFO] #Images(valid): 216 / total 216


Train 1/20: 100%|██████████| 49/49 [00:33<00:00,  1.46it/s, total_loss=0.1710]
                                          

AttributeError: 'list' object has no attribute 'values'

In [16]:
import os
import time
import glob
import xml.etree.ElementTree as ET
from typing import List, Tuple

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# =====================
# CONFIG (edit here)
# =====================
DATA_DIR = r"D:\AI_SVT_Training_mk\annotations\annos"   # jpg/png + VOC xml가 함께 있는 폴더
LOG_BASE = r"D:\AI_SVT_Training_mk\train_result\pytorch_det"  # TensorBoard 로그 + 체크포인트 출력 기본 폴더
RUN_NAME = time.strftime("run_%Y%m%d_%H%M%S")
LOGDIR = os.path.join(LOG_BASE, RUN_NAME)
CKPT_DIR = os.path.join(LOGDIR, "checkpoints")

EPOCHS = 20
BATCH_SIZE = 4                 # GPU 메모리에 맞게 조절
LR = 1e-4
NUM_WORKERS = 0  # Jupyter/Windows에서 DataLoader 다중 프로세스 충돌 방지. 스크립트 실행 시 4~8로 올리세요.
VAL_SPLIT = 0.1                # 10% 검증
MIN_BOX_SIZE = 1               # 너무 작은 박스 제거(px)
SAVE_BEST_ONLY = True
SEED = 42

# =====================================
# Utility: set seed
# =====================================
def set_seed(seed: int = 42):
    import random
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

os.makedirs(LOGDIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}

def find_pairs(data_dir: str) -> List[Tuple[str, str]]:
    xmls = sorted(glob.glob(os.path.join(data_dir, "*.xml")))
    pairs = []
    for xml in xmls:
        stem = os.path.splitext(os.path.basename(xml))[0]
        img_path = None
        for ext in IMG_EXTS:
            cand = os.path.join(data_dir, stem + ext)
            if os.path.exists(cand):
                img_path = cand
                break
        if img_path is not None:
            pairs.append((img_path, xml))
    return pairs


def xml_has_valid_object(xml_path: str) -> bool:
    try:
        root = ET.parse(xml_path).getroot()
    except Exception:
        return False
    for obj in root.findall("object"):
        bnd = obj.find("bndbox")
        if bnd is None:
            continue
        try:
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))
        except Exception:
            continue
        if (xmax - xmin) >= MIN_BOX_SIZE and (ymax - ymin) >= MIN_BOX_SIZE:
            return True
    return False

pairs_all = find_pairs(DATA_DIR)
# 학습 중 빈 타겟 샘플로 인한 크래시를 방지하기 위해, 미리 필터링
pairs = [(img, xml) for (img, xml) in pairs_all if xml_has_valid_object(xml)]
if len(pairs) == 0:
    raise FileNotFoundError(f"No valid (image, xml) pairs with at least one box in: {DATA_DIR}")

# gather class names from filtered pairs만 사용
class_names = []
for _, xml in pairs:
    root = ET.parse(xml).getroot()
    for obj in root.findall("object"):
        name = obj.findtext("name")
        if name:
            class_names.append(name.strip())

unique_classes = sorted(set(class_names))
if not unique_classes:
    raise RuntimeError("No object classes found in XMLs after filtering.")

# background=0 rule (torchvision detection)
class_to_idx = {c: i + 1 for i, c in enumerate(unique_classes)}
idx_to_class = {i + 1: c for i, c in enumerate(unique_classes)}

with open(os.path.join(LOGDIR, "classes.txt"), "w", encoding="utf-8") as f:
    for c in unique_classes:
        f.write(f"{c}\n")

print("[INFO] Classes:", unique_classes)
print("[INFO] #Images(valid):", len(pairs), f"/ total {len(pairs_all)}")

# =====================================
# Dataset
# =====================================
class VOCDataset(Dataset):
    def __init__(self, pairs, transforms=None):
        self.pairs = pairs
        self.transforms = transforms

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_path, xml_path = self.pairs[idx]
        img = Image.open(img_path).convert("RGB")
        w, h = img.size

        boxes = []
        labels = []
        iscrowd = []

        root = ET.parse(xml_path).getroot()
        for obj in root.findall("object"):
            name = obj.findtext("name").strip()
            if name not in class_to_idx:
                continue
            bnd = obj.find("bndbox")
            if bnd is None:
                continue
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))

            # clamp
            xmin = max(0, min(xmin, w - 1))
            ymin = max(0, min(ymin, h - 1))
            xmax = max(0, min(xmax, w - 1))
            ymax = max(0, min(ymax, h - 1))

            if xmax - xmin < MIN_BOX_SIZE or ymax - ymin < MIN_BOX_SIZE:
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_to_idx[name])

            diff = obj.findtext("difficult")
            iscrowd.append(1 if (diff and diff.strip() == "1") else 0)

        # 0개 박스 샘플은 미리 필터링했으므로 여기선 반드시 >=1
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        iscrowd = torch.tensor(iscrowd, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "iscrowd": iscrowd,
            "image_id": torch.tensor([idx], dtype=torch.int64),
        }

        img_t = F.pil_to_tensor(img).float() / 255.0
        if self.transforms:
            img_t = self.transforms(img_t)
        return img_t, target


def collate_fn(batch):
    imgs, tgts = list(zip(*batch))
    return list(imgs), list(tgts)

# =====================================
# Split train/val
# =====================================
from sklearn.model_selection import train_test_split
train_pairs, val_pairs = train_test_split(pairs, test_size=VAL_SPLIT, random_state=SEED, shuffle=True)

train_ds = VOCDataset(train_pairs)
val_ds   = VOCDataset(val_pairs)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)

# =====================================
# Model / Optim (LEGACY API for torchvision 0.12)
# =====================================
num_classes = 1 + len(unique_classes)  # background + K

# 구버전: weights 인자 없음. COCO 사전학습 가중치 로드 → head 교체
model = fasterrcnn_resnet50_fpn(pretrained=True)  # backbone+RPN+head가 COCO용
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)  # 새 head로 교체

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(params, lr=LR)

# =====================================
# TensorBoard writer
# =====================================
writer = SummaryWriter(LOGDIR)
with open(os.path.join(LOGDIR, "readme.txt"), "w", encoding="utf-8") as f:
    f.write(f"DATA_DIR={DATA_DIR}\n")
    f.write(f"classes={unique_classes}\n")
    f.write("legacy api: torchvision 0.12 / torch 1.11\n")

# =====================================
# Train/Eval loops
# =====================================

def evaluate_loss(model, loader, max_iters=None):
    """Compute validation loss for torchvision detection models on torch==1.11/vision==0.12.
    IMPORTANT: losses are returned ONLY when the model is in train mode.
    So we temporarily switch to train(), run forward under no_grad(), then restore mode.
    """
    prev_training = model.training
    model.train()  # to get loss dict instead of detections list
    total = 0.0
    count = 0
    with torch.no_grad():
        pbar = tqdm(loader, desc="Val", leave=False)
        for i, (images, targets) in enumerate(pbar):
            images = [im.to(device) for im in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            # loss_dict is a dict in train() mode
            loss = sum(v for v in loss_dict.values())
            total += float(loss.item())
            count += 1
            if max_iters and i + 1 >= max_iters:
                break
    # restore previous mode
    model.train(prev_training)
    return total / max(1, count)

best_val = float("inf")
step = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Train {epoch}/{EPOCHS}")
    for images, targets in pbar:
        images = [im.to(device) for im in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        loss = sum(v for v in loss_dict.values())

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # TB per-step
        writer.add_scalar("train/total_loss", loss.item(), step)
        for k, v in loss_dict.items():
            writer.add_scalar(f"train/{k}", v.item(), step)
        step += 1

        pbar.set_postfix(total_loss=f"{loss.item():.4f}")

    # Validation
    val_loss = evaluate_loss(model, val_loader)
    writer.add_scalar("val/total_loss", val_loss, epoch)

    # Save
    ckpt_path = os.path.join(CKPT_DIR, f"epoch{epoch:03d}_valloss{val_loss:.4f}.pt")
    torch.save({
        "epoch": epoch,
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "classes": unique_classes,
        "val_loss": val_loss,
    }, ckpt_path)

    if SAVE_BEST_ONLY:
        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), os.path.join(CKPT_DIR, "best.pt"))
    else:
        torch.save(model.state_dict(), os.path.join(CKPT_DIR, "last.pt"))

# 훈련 종료 마커 (GUI가 감지하도록 텍스트 로그 남김)
writer.add_text("status/final", f"done@{time.strftime('%Y-%m-%d %H:%M:%S')}")
writer.close()

print(f"[DONE] Logs: {LOGDIR}")
print(f"[TIP] tensorboard --logdir={LOG_BASE} --port=6007")


[INFO] Classes: ['1']
[INFO] #Images(valid): 216 / total 216


Train 1/20: 100%|██████████| 49/49 [00:18<00:00,  2.68it/s, total_loss=0.1342]
Train 2/20: 100%|██████████| 49/49 [00:18<00:00,  2.70it/s, total_loss=0.0895]
Train 3/20: 100%|██████████| 49/49 [00:18<00:00,  2.65it/s, total_loss=0.0844]
Train 4/20: 100%|██████████| 49/49 [00:18<00:00,  2.67it/s, total_loss=0.0577]
Train 5/20: 100%|██████████| 49/49 [00:18<00:00,  2.68it/s, total_loss=0.0526]
Train 6/20: 100%|██████████| 49/49 [00:18<00:00,  2.72it/s, total_loss=0.0860]
Train 7/20: 100%|██████████| 49/49 [00:18<00:00,  2.72it/s, total_loss=0.0759]
Train 8/20: 100%|██████████| 49/49 [00:18<00:00,  2.69it/s, total_loss=0.0468]
Train 9/20: 100%|██████████| 49/49 [00:18<00:00,  2.69it/s, total_loss=0.0391]
Train 10/20: 100%|██████████| 49/49 [00:18<00:00,  2.69it/s, total_loss=0.0358]
Train 11/20: 100%|██████████| 49/49 [00:18<00:00,  2.69it/s, total_loss=0.0446]
Train 12/20: 100%|██████████| 49/49 [00:17<00:00,  2.73it/s, total_loss=0.0306]
Train 13/20: 100%|██████████| 49/49 [00:18<00:00,

[DONE] Logs: D:\AI_SVT_Training_mk\train_result\pytorch_det\run_20251106_153757
[TIP] tensorboard --logdir=D:\AI_SVT_Training_mk\train_result\pytorch_det --port=6007


In [1]:
import os
import time
import glob
import xml.etree.ElementTree as ET
from typing import List, Tuple

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.utils import draw_bounding_boxes, make_grid
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# =====================
# CONFIG (edit here)
# =====================
DATA_DIR = r"D:\AI_SVT_Training_mk\annotations\annos"   # jpg/png + VOC xml가 함께 있는 폴더
LOG_BASE = r"D:\AI_SVT_Training_mk\train_result\pytorch_det"  # TensorBoard 로그 + 체크포인트 출력 기본 폴더
RUN_NAME = time.strftime("run_%Y%m%d_%H%M%S")
LOGDIR = os.path.join(LOG_BASE, RUN_NAME)
CKPT_DIR = os.path.join(LOGDIR, "checkpoints")

EPOCHS = 20
BATCH_SIZE = 4                 # GPU 메모리에 맞게 조절
LR = 1e-4
NUM_WORKERS = 0  # Jupyter/Windows에서 DataLoader 다중 프로세스 충돌 방지. 스크립트 실행 시 4~8로 올리세요.
VAL_SPLIT = 0.1                # 10% 검증
MIN_BOX_SIZE = 1               # 너무 작은 박스 제거(px)
SAVE_BEST_ONLY = True
SEED = 42

# =====================================
# Utility: set seed
# =====================================
def set_seed(seed: int = 42):
    import random
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

os.makedirs(LOGDIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}

def find_pairs(data_dir: str) -> List[Tuple[str, str]]:
    xmls = sorted(glob.glob(os.path.join(data_dir, "*.xml")))
    pairs = []
    for xml in xmls:
        stem = os.path.splitext(os.path.basename(xml))[0]
        img_path = None
        for ext in IMG_EXTS:
            cand = os.path.join(data_dir, stem + ext)
            if os.path.exists(cand):
                img_path = cand
                break
        if img_path is not None:
            pairs.append((img_path, xml))
    return pairs


def xml_has_valid_object(xml_path: str) -> bool:
    try:
        root = ET.parse(xml_path).getroot()
    except Exception:
        return False
    for obj in root.findall("object"):
        bnd = obj.find("bndbox")
        if bnd is None:
            continue
        try:
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))
        except Exception:
            continue
        if (xmax - xmin) >= MIN_BOX_SIZE and (ymax - ymin) >= MIN_BOX_SIZE:
            return True
    return False

pairs_all = find_pairs(DATA_DIR)
# 학습 중 빈 타겟 샘플로 인한 크래시를 방지하기 위해, 미리 필터링
pairs = [(img, xml) for (img, xml) in pairs_all if xml_has_valid_object(xml)]
if len(pairs) == 0:
    raise FileNotFoundError(f"No valid (image, xml) pairs with at least one box in: {DATA_DIR}")

# gather class names from filtered pairs만 사용
class_names = []
for _, xml in pairs:
    root = ET.parse(xml).getroot()
    for obj in root.findall("object"):
        name = obj.findtext("name")
        if name:
            class_names.append(name.strip())

unique_classes = sorted(set(class_names))
if not unique_classes:
    raise RuntimeError("No object classes found in XMLs after filtering.")

# background=0 rule (torchvision detection)
class_to_idx = {c: i + 1 for i, c in enumerate(unique_classes)}
idx_to_class = {i + 1: c for i, c in enumerate(unique_classes)}

with open(os.path.join(LOGDIR, "classes.txt"), "w", encoding="utf-8") as f:
    for c in unique_classes:
        f.write(f"{c}\n")

print("[INFO] Classes:", unique_classes)
print("[INFO] #Images(valid):", len(pairs), f"/ total {len(pairs_all)}")

# =====================================
# Dataset
# =====================================
class VOCDataset(Dataset):
    def __init__(self, pairs, transforms=None):
        self.pairs = pairs
        self.transforms = transforms

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_path, xml_path = self.pairs[idx]
        img = Image.open(img_path).convert("RGB")
        w, h = img.size

        boxes = []
        labels = []
        iscrowd = []

        root = ET.parse(xml_path).getroot()
        for obj in root.findall("object"):
            name = obj.findtext("name").strip()
            if name not in class_to_idx:
                continue
            bnd = obj.find("bndbox")
            if bnd is None:
                continue
            xmin = float(bnd.findtext("xmin"))
            ymin = float(bnd.findtext("ymin"))
            xmax = float(bnd.findtext("xmax"))
            ymax = float(bnd.findtext("ymax"))

            # clamp
            xmin = max(0, min(xmin, w - 1))
            ymin = max(0, min(ymin, h - 1))
            xmax = max(0, min(xmax, w - 1))
            ymax = max(0, min(ymax, h - 1))

            if xmax - xmin < MIN_BOX_SIZE or ymax - ymin < MIN_BOX_SIZE:
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_to_idx[name])

            diff = obj.findtext("difficult")
            iscrowd.append(1 if (diff and diff.strip() == "1") else 0)

        # 0개 박스 샘플은 미리 필터링했으므로 여기선 반드시 >=1
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        iscrowd = torch.tensor(iscrowd, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "iscrowd": iscrowd,
            "image_id": torch.tensor([idx], dtype=torch.int64),
        }

        img_t = F.pil_to_tensor(img).float() / 255.0
        if self.transforms:
            img_t = self.transforms(img_t)
        return img_t, target


def collate_fn(batch):
    imgs, tgts = list(zip(*batch))
    return list(imgs), list(tgts)

# =====================================
# Split train/val
# =====================================
from sklearn.model_selection import train_test_split
train_pairs, val_pairs = train_test_split(pairs, test_size=VAL_SPLIT, random_state=SEED, shuffle=True)

train_ds = VOCDataset(train_pairs)
val_ds   = VOCDataset(val_pairs)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn)

# =====================================
# Model / Optim (LEGACY API for torchvision 0.12)
# =====================================
num_classes = 1 + len(unique_classes)  # background + K

# 구버전: weights 인자 없음. COCO 사전학습 가중치 로드 → head 교체
model = fasterrcnn_resnet50_fpn(pretrained=True)  # backbone+RPN+head가 COCO용
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)  # 새 head로 교체

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(params, lr=LR)

# =====================================
# TensorBoard writer
# =====================================
writer = SummaryWriter(LOGDIR)
with open(os.path.join(LOGDIR, "readme.txt"), "w", encoding="utf-8") as f:
    f.write(f"DATA_DIR={DATA_DIR}\n")
    f.write(f"classes={unique_classes}\n")
    f.write("legacy api: torchvision 0.12 / torch 1.11\n")

# =====================================
# Train/Eval loops
# =====================================

def evaluate_loss(model, loader, max_iters=None):
    """Compute validation loss for torchvision detection models on torch==1.11/vision==0.12.
    IMPORTANT: losses are returned ONLY when the model is in train mode.
    So we temporarily switch to train(), run forward under no_grad(), then restore mode.
    """
    prev_training = model.training
    model.train()  # to get loss dict instead of detections list
    total = 0.0
    count = 0
    with torch.no_grad():
        pbar = tqdm(loader, desc="Val", leave=False)
        for i, (images, targets) in enumerate(pbar):
            images = [im.to(device) for im in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            # loss_dict is a dict in train() mode
            loss = sum(v for v in loss_dict.values())
            total += float(loss.item())
            count += 1
            if max_iters and i + 1 >= max_iters:
                break
    # restore previous mode
    model.train(prev_training)
    return total / max(1, count)


def log_predictions_for_tb(model, loader, writer, epoch: int, device, score_thr: float = 0.5, max_images: int = 4):
    """Run model in eval() to get predictions and upload visualizations to TensorBoard.
    - Filters by score >= score_thr
    - Logs up to max_images images under tag 'predictions/*'
    """
    prev_training = model.training
    model.eval()
    logged = 0
    with torch.no_grad():
        for images, targets in loader:
            images = [im.to(device) for im in images]
            preds = model(images)  # list of dicts with boxes/scores/labels
            for img_t, pred in zip(images, preds):
                img = (img_t.clamp(0, 1) * 255).byte().cpu()
                boxes = pred.get("boxes", torch.empty((0, 4)))
                scores = pred.get("scores", torch.empty((0,)))
                labels = pred.get("labels", torch.empty((0,), dtype=torch.int64))
                if boxes.numel() == 0:
                    continue
                keep = scores >= score_thr
                boxes = boxes[keep].cpu()
                labels = labels[keep].cpu()
                scores = scores[keep].cpu()
                if boxes.numel() == 0:
                    continue
                # prepare string labels with class names and scores
                text_labels = []
                for l, s in zip(labels.tolist(), scores.tolist()):
                    name = idx_to_class.get(l, str(l))
                    text_labels.append(f"{name}:{s:.2f}")
                vis = draw_bounding_boxes(img, boxes=boxes, labels=text_labels, width=2)
                writer.add_image(f"predictions/val_image_{logged}", vis, epoch)
                logged += 1
                if logged >= max_images:
                    break
            if logged >= max_images:
                break
    # restore previous mode
    model.train(prev_training)

best_val = float("inf")
step = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Train {epoch}/{EPOCHS}")
    for images, targets in pbar:
        images = [im.to(device) for im in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        loss = sum(v for v in loss_dict.values())

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # TB per-step
        writer.add_scalar("train/total_loss", loss.item(), step)
        for k, v in loss_dict.items():
            writer.add_scalar(f"train/{k}", v.item(), step)
        step += 1

        pbar.set_postfix(total_loss=f"{loss.item():.4f}")

    # Validation
    val_loss = evaluate_loss(model, val_loader)
    writer.add_scalar("val/total_loss", val_loss, epoch)

    # Eval-mode predictions → TensorBoard 시각화 (상위 4장, score≥0.5)
    log_predictions_for_tb(model, val_loader, writer, epoch, device, score_thr=0.5, max_images=4)

    # Save
    ckpt_path = os.path.join(CKPT_DIR, f"epoch{epoch:03d}_valloss{val_loss:.4f}.pt")
    torch.save({
        "epoch": epoch,
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "classes": unique_classes,
        "val_loss": val_loss,
    }, ckpt_path)

    if SAVE_BEST_ONLY:
        if val_loss < best_val:
            best_val = val_loss
            torch.save(model.state_dict(), os.path.join(CKPT_DIR, "best.pt"))
    else:
        torch.save(model.state_dict(), os.path.join(CKPT_DIR, "last.pt"))

# 훈련 종료 마커 (GUI가 감지하도록 텍스트 로그 남김)
writer.add_text("status/final", f"done@{time.strftime('%Y-%m-%d %H:%M:%S')}")
writer.close()

print(f"[DONE] Logs: {LOGDIR}")
print(f"[TIP] tensorboard --logdir={LOG_BASE} --port=6007")


  from .autonotebook import tqdm as notebook_tqdm


[INFO] Classes: ['1']
[INFO] #Images(valid): 216 / total 216


Train 1/20: 100%|██████████| 49/49 [00:37<00:00,  1.32it/s, total_loss=0.1729]
Train 2/20: 100%|██████████| 49/49 [00:17<00:00,  2.83it/s, total_loss=0.0773]
Train 3/20: 100%|██████████| 49/49 [00:17<00:00,  2.83it/s, total_loss=0.0712]
Train 4/20: 100%|██████████| 49/49 [00:17<00:00,  2.85it/s, total_loss=0.0713]
Train 5/20: 100%|██████████| 49/49 [00:17<00:00,  2.82it/s, total_loss=0.0387]
Train 6/20: 100%|██████████| 49/49 [00:17<00:00,  2.82it/s, total_loss=0.0539]
Train 7/20: 100%|██████████| 49/49 [00:17<00:00,  2.82it/s, total_loss=0.0788]
Train 8/20: 100%|██████████| 49/49 [00:17<00:00,  2.83it/s, total_loss=0.0434]
Train 9/20: 100%|██████████| 49/49 [00:17<00:00,  2.83it/s, total_loss=0.0415]
Train 10/20: 100%|██████████| 49/49 [00:17<00:00,  2.83it/s, total_loss=0.0394]
Train 11/20: 100%|██████████| 49/49 [00:17<00:00,  2.83it/s, total_loss=0.0387]
Train 12/20: 100%|██████████| 49/49 [00:17<00:00,  2.83it/s, total_loss=0.0271]
Train 13/20: 100%|██████████| 49/49 [00:17<00:00,

[DONE] Logs: D:\AI_SVT_Training_mk\train_result\pytorch_det\run_20251106_162958
[TIP] tensorboard --logdir=D:\AI_SVT_Training_mk\train_result\pytorch_det --port=6007


In [4]:
"""
Export the best PyTorch Faster R-CNN model picked by TensorBoard metrics (legacy torch==1.11 / tv==0.12).
- Scans a run directory under LOG_BASE (or auto-picks the latest run) where the training script saved:
  - TensorBoard scalars: `val/total_loss` (and possibly other metrics in the future)
  - Checkpoints: `checkpoints/epochXXX_valloss*.pt` and (optionally) `checkpoints/best.pt`
- Chooses the best epoch by lowest `val/total_loss` (if you later log mAP, you can flip the selector).
- Rebuilds model with the correct num_classes using the `classes` saved in checkpoint.
- Exports a lightweight deployment package:
  * model_best.pt (state_dict)
  * classes.txt
  * best_epoch.json
  * model_config.json
  * inference_example.py  (simple example loader to run predictions on an images folder)

NOTE: TorchScript/ONNX export of torchvision detection models on torch 1.11 / tv 0.12 is brittle.
      This script intentionally exports state_dict and a reference inference script instead of JIT/ONNX.
"""
import os
import re
import json
import glob
import shutil
from datetime import datetime

import torch
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# ========== CONFIG ==========
LOG_BASE = r"D:\AI_SVT_Training_mk\train_result\pytorch_det"  # parent folder containing run_YYYYMMDD_HHMMSS
RUN_DIR  = None  # if None, auto-pick the most recently modified run under LOG_BASE
SCALAR_KEY = "val/total_loss"  # metric to minimize
EXPORT_NAME = "exported_model"  # subfolder name inside RUN_DIR
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# ============================


def pick_latest_run(base):
    runs = []
    for d in glob.glob(os.path.join(base, "run_*")):
        if os.path.isdir(d):
            mtime = os.path.getmtime(d)
            runs.append((mtime, d))
    if not runs:
        raise FileNotFoundError(f"No run_* directory found under {base}")
    runs.sort(reverse=True)
    return runs[0][1]


def find_event_files(run_dir):
    event_files = glob.glob(os.path.join(run_dir, "events.out.tfevents.*"))
    if not event_files:
        # sometimes SummaryWriter creates subdirs; scan all
        event_files = glob.glob(os.path.join(run_dir, "**", "events.out.tfevents.*"), recursive=True)
    if not event_files:
        raise FileNotFoundError(f"No TensorBoard event files found in {run_dir}")
    return event_files


def load_scalar_series(event_file, tag):
    ea = EventAccumulator(event_file)
    ea.Reload()
    if tag not in ea.Tags().get('scalars', []):
        return []
    return [(e.step, e.value) for e in ea.Scalars(tag)]


def choose_best_epoch_by_scalar(event_files, tag):
    """Merge all TB files; take min value of the tag, return (best_step, best_value)."""
    all_points = []
    for ef in event_files:
        pts = load_scalar_series(ef, tag)
        all_points.extend(pts)
    if not all_points:
        raise RuntimeError(f"No scalar data for tag '{tag}' in event files.")
    # deduplicate by step keeping the last value
    by_step = {}
    for step, val in all_points:
        by_step[step] = val
    best_step = min(by_step, key=lambda s: by_step[s])
    return best_step, by_step[best_step]


def locate_checkpoint(run_dir, best_epoch):
    ckpt_dir = os.path.join(run_dir, "checkpoints")
    if not os.path.isdir(ckpt_dir):
        raise FileNotFoundError(f"Checkpoint dir not found: {ckpt_dir}")
    # direct best.pt takes precedence if exists
    best_pt = os.path.join(ckpt_dir, "best.pt")
    if os.path.exists(best_pt):
        return best_pt
    # otherwise pick epoch file
    pattern = os.path.join(ckpt_dir, f"epoch{best_epoch:03d}_valloss*.pt")
    matches = glob.glob(pattern)
    if matches:
        return matches[0]
    # fallback: last epoch file with the same epoch number (if naming changed)
    pattern = os.path.join(ckpt_dir, f"epoch{best_epoch:03d}_*.pt")
    matches = glob.glob(pattern)
    if matches:
        return matches[0]
    raise FileNotFoundError(f"No checkpoint found for epoch {best_epoch} in {ckpt_dir}")


def rebuild_model_from_ckpt(ckpt_path, device=DEVICE):
    ckpt = torch.load(ckpt_path, map_location="cpu")
    classes = ckpt.get("classes")
    if classes is None:
        raise RuntimeError("Checkpoint does not contain 'classes'. Re-train with the provided training script.")
    num_classes = 1 + len(classes)
    model = fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    state = ckpt.get("model") or ckpt  # support plain state_dict or wrapped
    model.load_state_dict(state)
    model.eval().to(device)
    return model, classes


def export_package(run_dir, ckpt_path, best_info, model, classes):
    export_dir = os.path.join(run_dir, EXPORT_NAME)
    os.makedirs(export_dir, exist_ok=True)

    # Save state_dict (pure)
    state_path = os.path.join(export_dir, "model_best.pt")
    torch.save(model.state_dict(), state_path)

    # Save classes
    classes_txt_src = os.path.join(run_dir, "classes.txt")
    classes_txt_dst = os.path.join(export_dir, "classes.txt")
    if os.path.exists(classes_txt_src):
        shutil.copy2(classes_txt_src, classes_txt_dst)
    else:
        with open(classes_txt_dst, "w", encoding="utf-8") as f:
            for c in classes:
                f.write(f"{c}\n")

    # Save metadata
    meta = {
        "selected_by": SCALAR_KEY,
        "best_step": int(best_info[0]),
        "best_value": float(best_info[1]),
        "checkpoint": os.path.relpath(ckpt_path, run_dir),
        "export_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    with open(os.path.join(export_dir, "best_epoch.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    with open(os.path.join(export_dir, "model_config.json"), "w", encoding="utf-8") as f:
        json.dump({"num_classes": 1 + len(classes), "classes": classes}, f, ensure_ascii=False, indent=2)

    # Write a tiny inference helper
    infer_py = os.path.join(export_dir, "inference_example.py")
    with open(infer_py, "w", encoding="utf-8") as f:
        f.write(f"""# Minimal inference example for torchvision Faster R-CNN (torch 1.11 / tv 0.12)
import os, glob
from PIL import Image
import torch
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

DEVICE = '{DEVICE}'
EXPORT_DIR = os.path.dirname(__file__)
STATE_DICT = os.path.join(EXPORT_DIR, 'model_best.pt')
CLASSES_TXT = os.path.join(EXPORT_DIR, 'classes.txt')

# load classes
classes = []
with open(CLASSES_TXT, 'r', encoding='utf-8') as f:
    for line in f:
        line=line.strip()
        if line:
            classes.append(line)
num_classes = 1 + len(classes)

# rebuild model
model = fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.load_state_dict(torch.load(STATE_DICT, map_location='cpu'))
model.eval().to(DEVICE)

@torch.no_grad()
def predict_image(img_path, score_thr=0.5):
    img = Image.open(img_path).convert('RGB')
    x = F.pil_to_tensor(img).float()/255.0
    pred = model([x.to(DEVICE)])[0]
    keep = pred['scores'] >= score_thr
    return {
        'boxes': pred['boxes'][keep].cpu().tolist(),
        'scores': pred['scores'][keep].cpu().tolist(),
        'labels': [classes[int(l)-1] for l in pred['labels'][keep].cpu().tolist()],
    }

if __name__ == '__main__':
    test_dir = os.path.join(EXPORT_DIR, 'test_images')
    os.makedirs(test_dir, exist_ok=True)
    print(f"Put test images in: {test_dir}")
    for img in glob.glob(os.path.join(test_dir, '*.*')):
        try:
            out = predict_image(img)
            print(os.path.basename(img), out)
        except Exception as e:
            print('ERR', img, e)
""")

    print(f"[EXPORT] Package written to: {export_dir}")
    return export_dir


def main():
    run_dir = RUN_DIR or pick_latest_run(LOG_BASE)
    print(f"[RUN] Using run directory: {run_dir}")

    event_files = find_event_files(run_dir)
    best_step, best_val = choose_best_epoch_by_scalar(event_files, SCALAR_KEY)
    print(f"[PICK] Best by '{SCALAR_KEY}': step={best_step}, value={best_val:.6f}")

    # Our training logged val scalar per epoch with global_step==epoch
    best_epoch = int(best_step)
    ckpt_path = locate_checkpoint(run_dir, best_epoch)
    print(f"[CKPT] Using checkpoint: {ckpt_path}")

    model, classes = rebuild_model_from_ckpt(ckpt_path, device=DEVICE)
    export_dir = export_package(run_dir, ckpt_path, (best_step, best_val), model, classes)

    print("[DONE] Export complete. You can run inference_example.py to test.")


if __name__ == "__main__":
    main()


[RUN] Using run directory: D:\AI_SVT_Training_mk\train_result\pytorch_det\run_20251106_162958
[PICK] Best by 'val/total_loss': step=18, value=0.033118
[CKPT] Using checkpoint: D:\AI_SVT_Training_mk\train_result\pytorch_det\run_20251106_162958\checkpoints\best.pt


RuntimeError: Checkpoint does not contain 'classes'. Re-train with the provided training script.

In [3]:
"""
Make TF-style export folder for PyTorch Faster R-CNN (torch==1.11 / tv==0.12).
Creates: D:\AI_SVT_Training_mk\output_inference_graph_pytorch\saved_model\

Contents (always):
  - model_state_dict.pt        (weights)
  - classes.txt                (one label per line)
  - label_map.pbtxt            (TF-style label map for compatibility)
  - model_config.json          (num_classes, class names)
  - inference.py               (CLI/func to run inference on an image or folder)
  - README.txt                 (how to use)

Optional (best-effort; guarded):
  - model_scripted.ts          (TorchScript; may fail on this version)
  - model.onnx                 (ONNX; only if onnx is installed and export passes)

It selects the best checkpoint by reading TensorBoard scalars (val/total_loss),
falling back to best.pt or the latest epoch file if needed.
"""
import os
import json
import glob
import shutil
from datetime import datetime

import torch
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# ========== CONFIG (edit) ==========
LOG_BASE = r"D:\AI_SVT_Training_mk\train_result\pytorch_det"
OUTPUT_ROOT = r"D:\AI_SVT_Training_mk\output_inference_graph_pytorch"
RUN_DIR = None               # None -> pick latest run under LOG_BASE
SCALAR_KEY = "val/total_loss"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# ===================================


def pick_latest_run(base):
    runs = sorted(
        [d for d in glob.glob(os.path.join(base, "run_*")) if os.path.isdir(d)],
        key=lambda p: os.path.getmtime(p),
        reverse=True,
    )
    if not runs:
        raise FileNotFoundError(f"No run_* directory found under {base}")
    return runs[0]


def find_event_files(run_dir):
    event_files = glob.glob(os.path.join(run_dir, "events.out.tfevents.*"))
    if not event_files:
        event_files = glob.glob(os.path.join(run_dir, "**", "events.out.tfevents.*"), recursive=True)
    return event_files


def choose_best_step(event_files, tag):
    points = []
    for ef in event_files:
        ea = EventAccumulator(ef)
        try:
            ea.Reload()
            if tag in ea.Tags().get('scalars', []):
                points.extend((e.step, e.value) for e in ea.Scalars(tag))
        except Exception:
            pass
    if not points:
        return None
    by_step = {}
    for s, v in points:
        by_step[s] = v
    best = min(by_step, key=lambda s: by_step[s])
    return int(best)


def locate_checkpoint(run_dir, best_epoch):
    ckpt_dir = os.path.join(run_dir, "checkpoints")
    if not os.path.isdir(ckpt_dir):
        raise FileNotFoundError(f"Checkpoint dir not found: {ckpt_dir}")
    cand = os.path.join(ckpt_dir, "best.pt")
    if os.path.exists(cand):
        return cand
    if best_epoch is not None:
        matches = glob.glob(os.path.join(ckpt_dir, f"epoch{best_epoch:03d}_*.pt"))
        if matches:
            return matches[0]
    # fallback: latest epoch file
    epoch_files = sorted(glob.glob(os.path.join(ckpt_dir, "epoch*_*.pt")), key=os.path.getmtime)
    if epoch_files:
        return epoch_files[-1]
    # fallback: any .pt
    others = sorted(glob.glob(os.path.join(ckpt_dir, "*.pt")), key=os.path.getmtime)
    if others:
        return others[-1]
    raise FileNotFoundError("No checkpoint .pt found in checkpoints/")


def read_classes(run_dir, state):
    # 1) classes.txt
    cls_txt = os.path.join(run_dir, "classes.txt")
    if os.path.exists(cls_txt):
        with open(cls_txt, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]
    # 2) infer from state dict predictor head
    if isinstance(state, dict):
        for k in [
            "roi_heads.box_predictor.cls_score.weight",
            "module.roi_heads.box_predictor.cls_score.weight",
        ]:
            if k in state:
                out_ch = state[k].shape[0]
                k_classes = out_ch - 1  # background included
                return [f"class_{i}" for i in range(1, k_classes + 1)]
    return None


def rebuild_model(num_classes, state_dict):
    model = fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    model.load_state_dict(state_dict, strict=False)
    model.eval().to(DEVICE)
    return model


def make_label_map_pbtxt(classes):
    lines = []
    for i, name in enumerate(classes, start=1):
        lines.append("item {")
        lines.append(f"  id: {i}")
        lines.append(f"  name: '{name}'")
        lines.append("}")
    return "\n".join(lines) + "\n"


def safe_torchscript(model, example):
    """Best-effort TorchScript. Returns path or None."""
    try:
        scripted = torch.jit.trace(model, [example])  # tracing list input
        return scripted
    except Exception:
        return None


def safe_export_onnx(model, example, out_path):
    try:
        import onnx  # noqa
        torch.onnx.export(
            model,
            (example,),
            out_path,
            input_names=["images"],
            output_names=["detections"],
            opset_version=12,
            do_constant_folding=True,
        )
        return True
    except Exception:
        return False


def main():
    run_dir = RUN_DIR or pick_latest_run(LOG_BASE)
    print(f"[RUN] {run_dir}")
    event_files = find_event_files(run_dir)
    best_epoch = choose_best_step(event_files, SCALAR_KEY)
    print(f"[BEST] epoch (by {SCALAR_KEY} min): {best_epoch}")

    ckpt_path = locate_checkpoint(run_dir, best_epoch)
    print(f"[CKPT] {ckpt_path}")
    raw = torch.load(ckpt_path, map_location="cpu")

    # --- robust state_dict extraction ---
    def _extract_state_dict(obj):
        # common wrappers
        if isinstance(obj, dict):
            for k in ["model", "state_dict", "model_state_dict", "model_state", "net", "module"]:
                v = obj.get(k, None)
                if isinstance(v, dict):
                    return v
            # maybe it's already a state_dict (tensor leaves)
            if any(isinstance(v, torch.Tensor) for v in obj.values()):
                return obj
        # unknown structure; return as-is (may raise later)
        return obj

    state = _extract_state_dict(raw)

    classes = read_classes(run_dir, state)
    if state is None or (isinstance(state, dict) and len(state) == 0):
        raise RuntimeError("Checkpoint does not contain a valid state_dict. Make sure training saved weights correctly.")
    if not classes:
        raise RuntimeError("Unable to determine classes. Ensure classes.txt exists or re-train with the provided script.")

    num_classes = 1 + len(classes)
    model = rebuild_model(num_classes, state)

    # ----- prepare output folder -----
    save_dir = os.path.join(OUTPUT_ROOT, "saved_model")
    os.makedirs(save_dir, exist_ok=True)

    # 1) state_dict
    torch.save(model.state_dict(), os.path.join(save_dir, "model_state_dict.pt"))

    # 2) classes files
    with open(os.path.join(save_dir, "classes.txt"), "w", encoding="utf-8") as f:
        for c in classes:
            f.write(f"{c}\n")
    with open(os.path.join(save_dir, "label_map.pbtxt"), "w", encoding="utf-8") as f:
        f.write(make_label_map_pbtxt(classes))

    # 3) config/metadata
    meta = {
        "export_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "device": DEVICE,
        "num_classes": num_classes,
        "classes": classes,
        "source_run": os.path.basename(run_dir),
        "checkpoint": os.path.basename(ckpt_path),
    }
    with open(os.path.join(save_dir, "model_config.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    # 4) inference helper script
    infer_py = os.path.join(save_dir, "inference.py")
    with open(infer_py, "w", encoding="utf-8") as f:
        f.write("""# Inference helper for exported PyTorch Faster R-CNN (torch 1.11 / tv 0.12)
import os, glob, json
from PIL import Image
import torch
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# pick device at runtime to avoid templating issues
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE = os.path.dirname(__file__)
STATE_DICT = os.path.join(BASE, 'model_state_dict.pt')
CONFIG_JSON = os.path.join(BASE, 'model_config.json')

with open(CONFIG_JSON, 'r', encoding='utf-8') as f:
    cfg = json.load(f)
classes = cfg['classes']
num_classes = cfg['num_classes']

model = fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.load_state_dict(torch.load(STATE_DICT, map_location='cpu'))
model.eval().to(DEVICE)

@torch.no_grad()
def predict_image(img_path, score_thr=0.5):
    img = Image.open(img_path).convert('RGB')
    x = F.pil_to_tensor(img).float()/255.0
    pred = model([x.to(DEVICE)])[0]
    keep = pred['scores'] >= score_thr
    return {
        'boxes': pred['boxes'][keep].cpu().tolist(),
        'scores': pred['scores'][keep].cpu().tolist(),
        'labels': [classes[int(l)-1] for l in pred['labels'][keep].cpu().tolist()],
    }

if __name__ == '__main__':
    test_dir = os.path.join(BASE, 'test_images')
    os.makedirs(test_dir, exist_ok=True)
    print("Put test images in:", test_dir)
    for img in glob.glob(os.path.join(test_dir, '*.*')):
        try:
            out = predict_image(img)
            print(os.path.basename(img), out)
        except Exception as e:
            print('ERR', img, e)
""")

    # 5) README
    with open(os.path.join(save_dir, "README.txt"), "w", encoding="utf-8") as f:
        f.write(
            "This folder mimics TensorFlow's output_inference_graph/saved_model for PyTorch.\n"
            "Always-available files: model_state_dict.pt, classes.txt, label_map.pbtxt, model_config.json, inference.py.\n"
            "Optional scripted/ONNX files are best-effort on torch 1.11 / tv 0.12.\n"
        )

    # ----- optional exports -----
    example = torch.rand(3, 480, 640)  # CHW image
    scripted = safe_torchscript(model, [example])
    if scripted is not None:
        scripted.save(os.path.join(save_dir, "model_scripted.ts"))
        print("[TS] TorchScript exported -> model_scripted.ts")
    else:
        print("[TS] TorchScript export skipped (not supported on this build)")

    onnx_path = os.path.join(save_dir, "model.onnx")
    if safe_export_onnx(model, example, onnx_path):
        print("[ONNX] Exported -> model.onnx")
    else:
        print("[ONNX] Export skipped (package missing or export failed)")

    print(f"[DONE] Exported to: {save_dir}")


if __name__ == '__main__':
    main()


[RUN] D:\AI_SVT_Training_mk\train_result\pytorch_det\run_20251106_162958
[BEST] epoch (by val/total_loss min): 18
[CKPT] D:\AI_SVT_Training_mk\train_result\pytorch_det\run_20251106_162958\checkpoints\best.pt


  (torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))


[TS] TorchScript export skipped (not supported on this build)
[ONNX] Export skipped (package missing or export failed)
[DONE] Exported to: D:\AI_SVT_Training_mk\output_inference_graph_pytorch\saved_model


- model_state_dict.pt : PyTorch 가중치(state_dict)

- classes.txt : 한 줄당 1개 클래스

- label_map.pbtxt : TF 호환 라벨맵 (툴 체인 호환용)

- model_config.json : 클래스 수/이름, 소스 런/체크포인트 정보

- inference.py : 이미지/폴더 추론 헬퍼 스크립트 (CLI로 바로 사용 가능)

- README.txt : 사용법 요약

11.07

In [4]:
# -*- coding: utf-8 -*-
import os
import re
import shutil
from typing import Optional, Dict, Any, List

class CheckpointManager:
    """
    PyTorch 체크포인트 보관 정책:
      - best 기준: metric이 좋아지면 저장, best.pth 갱신
      - last 기준: 매 스텝/에폭 끝에서 최신 저장
      - special: 특정 스텝(예: x*0.7) 통과 시 강제 저장
      - max_to_keep: 최근 N개만 유지 (None이면 제한 없음)
    파일명 규칙 예: ckpt_step=001234_metric=0.512.pth
    """
    def __init__(self, ckpt_dir: str, max_to_keep: Optional[int] = 20):
        self.ckpt_dir = ckpt_dir
        os.makedirs(self.ckpt_dir, exist_ok=True)
        self.max_to_keep = max_to_keep
        self._fname_re = re.compile(r"ckpt_step=(\d+)_metric=([0-9.]+)\.pth")

    def _list_ckpts(self) -> List[str]:
        return sorted([f for f in os.listdir(self.ckpt_dir) if f.endswith(".pth") and f.startswith("ckpt_step=")])

    def _prune(self):
        if self.max_to_keep is None:
            return
        files = self._list_ckpts()
        excess = len(files) - self.max_to_keep
        for i in range(excess):
            try:
                os.remove(os.path.join(self.ckpt_dir, files[i]))
            except FileNotFoundError:
                pass

    def save(self, step: int, metric: float, state: Dict[str, Any], tag: Optional[str] = None):
        fname = f"ckpt_step={step:06d}_metric={metric:.6f}.pth"
        fpath = os.path.join(self.ckpt_dir, fname)
        torch_save = state.pop("_torch_save", None)  # (희소 확장 포인트)
        if torch_save is not None:
            # 커스텀 저장 함수가 오면 사용
            torch_save(fpath, state)
        else:
            import torch
            torch.save(state, fpath)

        # last.pth 복사
        shutil.copy2(fpath, os.path.join(self.ckpt_dir, "last.pth"))

        # best 태그가 오면 best.pth 갱신
        if tag == "best":
            shutil.copy2(fpath, os.path.join(self.ckpt_dir, "best.pth"))

        # 개수 제한
        self._prune()
        return fpath


In [16]:
# -*- coding: utf-8 -*-
"""
PyTorch 1.11 훈련 스켈레톤:
 - TF의 x*0.7 시점 강제 저장
 - TensorBoard 로깅
 - best/last 체크포인트 관리
 - 하이퍼파라미터 덤프
"""
import os, json, time, yaml, math, random
from dataclasses import dataclass, asdict
from typing import Dict, Any
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

import sys, os
sys.path.insert(0, os.path.abspath(".."))  # 프로젝트 루트로 조정

from utils.checkpoint_manager import CheckpointManager

# -----------------------
# 0) 하이퍼파라미터/설정
# -----------------------
@dataclass
class TrainConfig:
    output_root: str = r"D:\AI_SVT_Training_mk\output_inference_graph_pytorch"
    max_steps: int = 10000             # TF에서 쓰시던 X
    batch_size: int = 8
    base_lr: float = 1e-3
    weight_decay: float = 1e-4
    seed: int = 42
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    metric_name: str = "val/mAP"       # 베스트 판별 지표 이름
    save_fraction: float = 0.7         # x*0.7 지점
    ckpt_max_to_keep: int = 20         # ★ 여기 때문에 "20개만" 남습니다. 바꾸세요.
    log_every: int = 50
    eval_every: int = 200              # 간단 검증 주기(데모)
    model_name: str = "resnet50_frcnn" # 예시 표기용

def set_seed(seed: int):
    random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)

# -----------------------
# 1) 예시 모델 (분류기로 데모)
#    → 실제 FRCNN/검출모델로 교체하세요.
# -----------------------
class TinyNet(nn.Module):
    def __init__(self, in_dim=1024, num_classes=4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_classes)
        )
    def forward(self, x):
        return self.net(x)

# -----------------------
# 2) 데이터 로더 (데모)
# -----------------------
def get_fake_batch(bs, in_dim=1024, num_classes=4, device="cpu"):
    x = torch.randn(bs, in_dim, device=device)
    y = torch.randint(0, num_classes, (bs,), device=device)
    return x, y

# -----------------------
# 3) 훈련 루프
# -----------------------
def main():
    cfg = TrainConfig()
    set_seed(cfg.seed)

    # 디렉토리
    ckpt_dir = os.path.join(cfg.output_root, "checkpoints")
    log_dir_train = os.path.join(cfg.output_root, "logs", "train")
    log_dir_val   = os.path.join(cfg.output_root, "logs", "val")
    os.makedirs(ckpt_dir, exist_ok=True)
    os.makedirs(log_dir_train, exist_ok=True)
    os.makedirs(log_dir_val, exist_ok=True)

    # 하이퍼파라미터 스냅샷
    with open(os.path.join(cfg.output_root, "saved_model", "config.yaml"), "w", encoding="utf-8") as f:
        os.makedirs(os.path.join(cfg.output_root, "saved_model"), exist_ok=True)
        yaml.safe_dump(asdict(cfg), f, allow_unicode=True, sort_keys=False)

    writer_tr = SummaryWriter(log_dir_train)
    writer_va = SummaryWriter(log_dir_val)

    # 모델/손실/옵티마이저
    model = TinyNet(in_dim=1024, num_classes=4).to(cfg.device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=cfg.base_lr, weight_decay=cfg.weight_decay)

    # 그래프 등록(선택)
    try:
        dummy = torch.randn(1, 1024).to(cfg.device)
        writer_tr.add_graph(model, dummy)
    except Exception:
        pass  # 일부 환경에서 add_graph가 실패할 수 있음

    # 체크포인트 관리자
    ckptm = CheckpointManager(ckpt_dir, max_to_keep=cfg.ckpt_max_to_keep)

    # 진행 변수
    global_step = 0
    best_metric = -1e9
    save_at = int(cfg.max_steps * cfg.save_fraction)  # ★ x*0.7 지점

    while global_step < cfg.max_steps:
        model.train()
        x, y = get_fake_batch(cfg.batch_size, device=cfg.device)
        logits = model(x)
        loss = criterion(logits, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        global_step += 1

        # 로그
        if global_step % cfg.log_every == 0:
            writer_tr.add_scalar("train/loss", loss.item(), global_step)

        # 간단 검증 (데모)
        if global_step % cfg.eval_every == 0 or global_step == cfg.max_steps:
            model.eval()
            with torch.no_grad():
                # 실제 프로젝트에서는 mAP/IoU 등의 검증 코드를 넣으세요.
                # 여기선 데모로 "val/mAP"를 (무작위에 노이즈 더한) 증가형 수치로 가정
                fake_val_map = 0.2 + 0.8 * (global_step / cfg.max_steps) + (torch.randn(1).item() * 0.01)
                writer_va.add_scalar(cfg.metric_name, fake_val_map, global_step)

                # 베스트 갱신 시 저장
                if fake_val_map > best_metric:
                    best_metric = fake_val_map
                    ckptm.save(
                        step=global_step,
                        metric=fake_val_map,
                        state={
                            "step": global_step,
                            "model": model.state_dict(),
                            "optimizer": optimizer.state_dict(),
                            "best_metric": best_metric,
                            "config": asdict(cfg),
                        },
                        tag="best"
                    )

        # x*0.7 지점 첫 통과 시 강제 저장
        if global_step == save_at:
            ckptm.save(
                step=global_step,
                metric=best_metric if best_metric > -1e9 else 0.0,
                state={
                    "step": global_step,
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "best_metric": best_metric,
                    "config": asdict(cfg),
                },
                tag=None
            )

    # 마지막 스냅샷
    ckptm.save(
        step=global_step,
        metric=best_metric if best_metric > -1e9 else 0.0,
        state={
            "step": global_step,
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "best_metric": best_metric,
            "config": asdict(cfg),
        },
        tag=None
    )

    writer_tr.close(); writer_va.close()
    print(f"[DONE] Training finished at step={global_step}, best={best_metric:.4f}")

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'utils.checkpoint_manager'

In [17]:
# -*- coding: utf-8 -*-
"""
원하는 스텝의 체크포인트(.pth)를 로드하여
 - TorchScript (script or trace)
 - ONNX
를 D:\AI_SVT_Training_mk\output_inference_graph_pytorch\saved_model\ 에 생성
"""
import os, json, time, glob, re
import torch
import torch.nn as nn

SAVE_ROOT = r"D:\AI_SVT_Training_mk\output_inference_graph_pytorch"
CKPT_DIR  = os.path.join(SAVE_ROOT, "checkpoints")
OUT_DIR   = os.path.join(SAVE_ROOT, "saved_model")

os.makedirs(OUT_DIR, exist_ok=True)

# 학습 때와 동일한 모델 클래스를 가져와야 합니다.
class TinyNet(nn.Module):
    def __init__(self, in_dim=1024, num_classes=4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_classes)
        )
    def forward(self, x):
        return self.net(x)

def find_ckpt_by_step(step: int):
    pat = re.compile(rf"ckpt_step={step:06d}_metric=([0-9.]+)\.pth")
    for f in os.listdir(CKPT_DIR):
        m = pat.match(f)
        if m:
            return os.path.join(CKPT_DIR, f), float(m.group(1))
    return None, None

def load_state(ckpt_path: str):
    obj = torch.load(ckpt_path, map_location="cpu")
    return obj

def export_from_ckpt(step: int = None, use_best: bool = False, onnx: bool = True, torchscript: bool = True):
    if use_best:
        ckpt_path = os.path.join(CKPT_DIR, "best.pth")
        assert os.path.isfile(ckpt_path), "best.pth 가 없습니다."
        metric = None
    else:
        assert step is not None, "step을 지정하거나 use_best=True 하세요."
        ckpt_path, metric = find_ckpt_by_step(step)
        assert ckpt_path is not None, f"해당 스텝({step}) ckpt가 없습니다."

    state = load_state(ckpt_path)
    cfg = state.get("config", {})
    num_classes = 4  # 실제 프로젝트에 맞추세요
    model = TinyNet(in_dim=1024, num_classes=num_classes)
    model.load_state_dict(state["model"])
    model.eval()

    # 더미 입력 (실제 입력 형태로 수정)
    dummy = torch.randn(1, 1024)

    # 1) TorchScript
    if torchscript:
        try:
            scripted = torch.jit.script(model)
        except Exception:
            scripted = torch.jit.trace(model, dummy)
        ts_path = os.path.join(OUT_DIR, "model_ts.pt")
        scripted.save(ts_path)

    # 2) ONNX
    if onnx:
        onnx_path = os.path.join(OUT_DIR, "model.onnx")
        torch.onnx.export(
            model, dummy, onnx_path,
            input_names=["input"], output_names=["logits"],
            opset_version=13, do_constant_folding=True, verbose=False
        )

    # 3) 메타 기록
    meta = {
        "export_time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "by": "export.py",
        "step": int(state.get("step", -1)),
        "metric_name": cfg.get("metric_name", "val/mAP"),
        "best_metric": float(state.get("best_metric", float("nan"))),
        "source_ckpt": os.path.basename(ckpt_path),
        "onnx": bool(onnx),
        "torchscript": bool(torchscript)
    }
    with open(os.path.join(OUT_DIR, "meta.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"[EXPORTED] {ckpt_path} → saved_model/")

if __name__ == "__main__":
    # 예시) 1) best로 추출
    # export_from_ckpt(use_best=True)

    # 예시) 2) TensorBoard 보고 '6000스텝'이 좋았다면:
    # export_from_ckpt(step=6000, use_best=False)

    export_from_ckpt(use_best=True)


AssertionError: best.pth 가 없습니다.

In [15]:
import utils.checkpoint_manager


ModuleNotFoundError: No module named 'utils.checkpoint_manager'