In [1]:
!jupyter nbextension enable --py widgetsnbextension
!pip install ultralytics

Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Collecting ultralytics
  Downloading ultralytics-8.3.137-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-1

# Новый раздел

In [2]:
import os
from torchvision.datasets import VOCDetection
from tqdm.notebook import tqdm
from PIL import Image
import os
import random

Пути для YOLO-совместимого датасета

In [3]:
YOLO_DATASET_PATH = "yolo_voc_dataset"
IMAGES_DIR = os.path.join(YOLO_DATASET_PATH, "images", "train")
LABELS_DIR = os.path.join(YOLO_DATASET_PATH, "labels", "train")
os.makedirs(IMAGES_DIR, exist_ok=True)
os.makedirs(LABELS_DIR, exist_ok=True)

Классы VOC

In [4]:
VOC_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow", "diningtable",
    "dog", "horse", "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor"
]

Загрузка VOC 2007 (train set)

In [5]:
voc_dataset = VOCDetection(root=".", year="2007", image_set="trainval", download=True)

100%|██████████| 460M/460M [00:19<00:00, 23.6MB/s]


Конвертация Pascal VOC → YOLO формат

In [6]:
def convert_bbox(size, box):
    dw = 1.0 / size[0]
    dh = 1.0 / size[1]
    x = (box[0] + box[1]) / 2.0 * dw
    y = (box[2] + box[3]) / 2.0 * dh
    w = (box[1] - box[0]) * dw
    h = (box[3] - box[2]) * dh
    return (x, y, w, h)

In [7]:
for i in tqdm(range(len(voc_dataset))):
    img, target = voc_dataset[i]
    img_id = target["annotation"]["filename"]
    width = int(target["annotation"]["size"]["width"])
    height = int(target["annotation"]["size"]["height"])
    objects = target["annotation"]["object"]
    img_save_path = os.path.join(IMAGES_DIR, img_id)
    img.save(img_save_path)

    label_save_path = os.path.join(LABELS_DIR, img_id.replace(".jpg", ".txt"))
    with open(label_save_path, "w") as f:
        if not isinstance(objects, list):
            objects = [objects]
        for obj in objects:
            cls = obj["name"]
            if cls not in VOC_CLASSES:
                continue
            cls_id = VOC_CLASSES.index(cls)
            bbox = obj["bndbox"]
            b = (
                float(bbox["xmin"]),
                float(bbox["xmax"]),
                float(bbox["ymin"]),
                float(bbox["ymax"]),
            )
            bb = convert_bbox((width, height), b)
            f.write(f"{cls_id} {' '.join(map(str, bb))}\n")

  0%|          | 0/5011 [00:00<?, ?it/s]

Создание YAML-конфигурации для обучения

In [8]:
voc_yaml = f"""
path: {YOLO_DATASET_PATH}
train: images/train
val: images/train

names:
"""
for i, cls in enumerate(VOC_CLASSES):
    voc_yaml += f"  {i}: {cls}\n"

with open("voc.yaml", "w") as f:
    f.write(voc_yaml)

Запуск обучения YOLOv8

In [9]:
from ultralytics import YOLO

model = YOLO('yolov8n.pt')
model.train(
    data="voc.yaml",
    epochs=3,
    imgsz=640,
    batch=4,
)

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 100MB/s]


Ultralytics 8.3.137 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=voc.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=3, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective=0.0, plots=True, pose=12.0, pretrained=True, pr

100%|██████████| 755k/755k [00:00<00:00, 23.0MB/s]

Overriding model.yaml nc=80 with nc=20

                   from  n    params  module                                       arguments                     
  0                  -1  1       464  ultralytics.nn.modules.conv.Conv             [3, 16, 3, 2]                 
  1                  -1  1      4672  ultralytics.nn.modules.conv.Conv             [16, 32, 3, 2]                
  2                  -1  1      7360  ultralytics.nn.modules.block.C2f             [32, 32, 1, True]             
  3                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  4                  -1  2     49664  ultralytics.nn.modules.block.C2f             [64, 64, 2, True]             
  5                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  6                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  7                  -1  1    295424  ultralytic




Model summary: 129 layers, 3,014,748 parameters, 3,014,732 gradients, 8.2 GFLOPs

Transferred 319/355 items from pretrained weights
Freezing layer 'model.22.dfl.conv.weight'
[34m[1mAMP: [0mrunning Automatic Mixed Precision (AMP) checks...
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 113MB/s]


[34m[1mAMP: [0mchecks passed ✅
[34m[1mtrain: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1452.8±743.2 MB/s, size: 37.9 KB)


[34m[1mtrain: [0mScanning /content/yolo_voc_dataset/labels/train... 5011 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5011/5011 [00:02<00:00, 2463.75it/s]

[34m[1mtrain: [0mNew cache created: /content/yolo_voc_dataset/labels/train.cache





[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 676.6±324.4 MB/s, size: 27.2 KB)


[34m[1mval: [0mScanning /content/yolo_voc_dataset/labels/train.cache... 5011 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5011/5011 [00:00<?, ?it/s]


Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000417, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 2 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 3 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/3     0.637G      1.164      2.904      1.365         24        640: 100%|██████████| 1253/1253 [02:18<00:00,  9.05it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 627/627 [00:48<00:00, 12.92it/s]


                   all       5011      15662      0.589      0.582      0.588      0.388

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/3     0.838G      1.183      2.157      1.383         16        640: 100%|██████████| 1253/1253 [02:11<00:00,  9.54it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 627/627 [00:45<00:00, 13.84it/s]


                   all       5011      15662      0.644      0.598      0.636      0.429

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        3/3      0.84G       1.14      1.971       1.35         16        640: 100%|██████████| 1253/1253 [02:09<00:00,  9.67it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 627/627 [00:44<00:00, 14.23it/s]


                   all       5011      15662      0.677      0.637      0.681      0.473

3 epochs completed in 0.150 hours.
Optimizer stripped from runs/detect/train/weights/last.pt, 6.2MB
Optimizer stripped from runs/detect/train/weights/best.pt, 6.2MB

Validating runs/detect/train/weights/best.pt...
Ultralytics 8.3.137 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 72 layers, 3,009,548 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 627/627 [00:41<00:00, 15.08it/s]


                   all       5011      15662      0.677      0.638      0.681      0.473
             aeroplane        240        331      0.746      0.782      0.815      0.569
               bicycle        255        418      0.654      0.702      0.707      0.473
                  bird        333        599       0.45      0.651      0.551       0.36
                  boat        188        398      0.407      0.599      0.467      0.298
                bottle        262        634      0.588      0.486       0.48      0.296
                   bus        197        272      0.619      0.688      0.726      0.598
                   car        761       1644      0.878      0.729      0.854      0.624
                   cat        344        389      0.778      0.729      0.771      0.566
                 chair        572       1432      0.729      0.407      0.548      0.358
                   cow        146        356      0.697      0.596      0.713      0.503
           diningtabl

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x784c5e048310>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043

Выведем метрики

In [14]:
results = model.val(data="voc.yaml", split="val")
metrics = results.results_dict

Ultralytics 8.3.137 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1266.3±489.2 MB/s, size: 35.8 KB)


[34m[1mval: [0mScanning /content/yolo_voc_dataset/labels/train.cache... 5011 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5011/5011 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1253/1253 [00:45<00:00, 27.46it/s]


                   all       5011      15662      0.677      0.638      0.682      0.473
             aeroplane        240        331      0.743      0.782      0.816      0.569
               bicycle        255        418      0.659      0.702      0.709      0.474
                  bird        333        599      0.449      0.653       0.55      0.357
                  boat        188        398      0.407      0.601      0.468      0.298
                bottle        262        634      0.593      0.486      0.486      0.301
                   bus        197        272      0.624      0.688      0.728      0.599
                   car        761       1644      0.877      0.728      0.854      0.625
                   cat        344        389       0.78      0.728      0.771      0.565
                 chair        572       1432      0.728      0.406      0.548      0.358
                   cow        146        356      0.696      0.601      0.713      0.501
           diningtabl

In [15]:
print("Precision   :", metrics['metrics/precision(B)'])
print("Recall      :", metrics['metrics/recall(B)'])
print("mAP@0.5     :", metrics['metrics/mAP50(B)'])
print("mAP@0.5:0.95:", metrics['metrics/mAP50-95(B)'])

Precision   : 0.6774948915621928
Recall      : 0.6377754865369493
mAP@0.5     : 0.6817597175660972
mAP@0.5:0.95: 0.47298864094034526


Улучшение бейзлайна

In [16]:
YOLO_DATASET_PATH = "yolo_voc_dataset"
IMAGES_DIR = os.path.join(YOLO_DATASET_PATH, "images")
LABELS_DIR = os.path.join(YOLO_DATASET_PATH, "labels")
TRAIN_IMG = os.path.join(IMAGES_DIR, "train")
VAL_IMG = os.path.join(IMAGES_DIR, "val")
TRAIN_LAB = os.path.join(LABELS_DIR, "train")
VAL_LAB = os.path.join(LABELS_DIR, "val")

for d in [TRAIN_IMG, VAL_IMG, TRAIN_LAB, VAL_LAB]:
    os.makedirs(d, exist_ok=True)

Загрузка и рандомный сплит

In [17]:
voc_dataset = VOCDetection(root=".", year="2007", image_set="trainval", download=True)
indices = list(range(len(voc_dataset)))
random.shuffle(indices)
split = int(0.85 * len(indices))
train_idx, val_idx = indices[:split], indices[split:]

In [18]:
def process_and_save(idx, img_dir, lab_dir):
    img, target = voc_dataset[idx]
    img_id = target["annotation"]["filename"]
    width = int(target["annotation"]["size"]["width"])
    height = int(target["annotation"]["size"]["height"])
    objects = target["annotation"]["object"]
    img_save_path = os.path.join(img_dir, img_id)
    img.save(img_save_path)
    label_save_path = os.path.join(lab_dir, img_id.replace(".jpg", ".txt"))
    with open(label_save_path, "w") as f:
        if not isinstance(objects, list):
            objects = [objects]
        for obj in objects:
            cls = obj["name"]
            if cls not in VOC_CLASSES:
                continue
            cls_id = VOC_CLASSES.index(cls)
            bbox = obj["bndbox"]
            b = (
                float(bbox["xmin"]),
                float(bbox["xmax"]),
                float(bbox["ymin"]),
                float(bbox["ymax"]),
            )
            bb = convert_bbox((width, height), b)
            f.write(f"{cls_id} {' '.join(map(str, bb))}\n")

for idx in tqdm(train_idx, desc="Train split conversion"):
    process_and_save(idx, TRAIN_IMG, TRAIN_LAB)

for idx in tqdm(val_idx, desc="Val split conversion"):
    process_and_save(idx, VAL_IMG, VAL_LAB)

Train split conversion:   0%|          | 0/4259 [00:00<?, ?it/s]

Val split conversion:   0%|          | 0/752 [00:00<?, ?it/s]

YAML-конфиг с двумя выборками

In [19]:
voc_yaml = f"""
path: {YOLO_DATASET_PATH}
train: images/train
val: images/val

names:
"""
for i, cls in enumerate(VOC_CLASSES):
    voc_yaml += f"  {i}: {cls}\n"

with open("voc.yaml", "w") as f:
    f.write(voc_yaml)

In [20]:
model = YOLO('yolov8n.pt')

model.train(
    data="voc.yaml",
    epochs=3,
    imgsz=640,
    batch=4,
    lr0=0.001,
    patience=10,
    optimizer='Adam',
    hsv_h=0.1,
    hsv_s=0.7,
    hsv_v=0.4,
    flipud=0.5,
    fliplr=0.5,
    mosaic=1.0,
    mixup=0.2,
    degrees=5.0,
    translate=0.1,
    scale=0.5,
    shear=2.0,
)

Ultralytics 8.3.137 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=voc.yaml, degrees=5.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=3, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.5, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.1, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.001, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.2, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train7, nbs=64, nms=False, opset=None, optimize=False, optimizer=Adam, overlap_mask=True, patience=10, perspective=0.0, plots=True, pose=12.0, pretrained=True, pro

[34m[1mtrain: [0mScanning /content/yolo_voc_dataset/labels/train.cache... 5011 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5011/5011 [00:00<?, ?it/s]

[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))





[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 395.5±137.0 MB/s, size: 33.6 KB)


[34m[1mval: [0mScanning /content/yolo_voc_dataset/labels/val... 752 images, 0 backgrounds, 0 corrupt: 100%|██████████| 752/752 [00:00<00:00, 964.22it/s] 

[34m[1mval: [0mNew cache created: /content/yolo_voc_dataset/labels/val.cache





Plotting labels to runs/detect/train7/labels.jpg... 
[34m[1moptimizer:[0m Adam(lr=0.001, momentum=0.937) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 2 dataloader workers
Logging results to [1mruns/detect/train7[0m
Starting training for 3 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/3     0.736G       1.65       3.38      1.817         45        640: 100%|██████████| 1253/1253 [02:25<00:00,  8.59it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:06<00:00, 13.58it/s]


                   all        752       2385      0.331       0.22      0.153     0.0742

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/3     0.846G      1.611      2.938      1.802         17        640: 100%|██████████| 1253/1253 [02:17<00:00,  9.08it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:07<00:00, 12.67it/s]


                   all        752       2385       0.35      0.365      0.284      0.152

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        3/3     0.861G      1.543      2.681      1.739         12        640: 100%|██████████| 1253/1253 [02:17<00:00,  9.11it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:06<00:00, 13.50it/s]


                   all        752       2385      0.471      0.393      0.422      0.242

3 epochs completed in 0.123 hours.
Optimizer stripped from runs/detect/train7/weights/last.pt, 6.2MB
Optimizer stripped from runs/detect/train7/weights/best.pt, 6.2MB

Validating runs/detect/train7/weights/best.pt...
Ultralytics 8.3.137 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 72 layers, 3,009,548 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 94/94 [00:07<00:00, 12.31it/s]


                   all        752       2385      0.471      0.397      0.422      0.242
             aeroplane         41         66      0.399      0.606      0.562      0.342
               bicycle         46         73      0.289      0.644      0.428      0.246
                  bird         34         53      0.657      0.218      0.408      0.215
                  boat         22         57      0.237     0.0175     0.0985     0.0539
                bottle         39         68      0.608     0.0913      0.226      0.109
                   bus         30         40      0.454      0.425      0.402      0.264
                   car        119        271      0.499      0.664      0.626      0.386
                   cat         49         56      0.599      0.571      0.607      0.387
                 chair         85        231      0.422      0.303      0.282      0.149
                   cow         23         69      0.675       0.24      0.429       0.22
           diningtabl

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x784c26284250>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043

Вывежем метрики

In [21]:
results = model.val(data="voc.yaml", split="val")
metrics = results.results_dict

Ultralytics 8.3.137 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 72 layers, 3,009,548 parameters, 0 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1003.5±544.8 MB/s, size: 34.5 KB)


[34m[1mval: [0mScanning /content/yolo_voc_dataset/labels/val.cache... 752 images, 0 backgrounds, 0 corrupt: 100%|██████████| 752/752 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 188/188 [00:08<00:00, 23.14it/s]


                   all        752       2385      0.467      0.394       0.42      0.241
             aeroplane         41         66      0.396      0.587      0.566      0.346
               bicycle         46         73      0.288      0.644      0.428      0.244
                  bird         34         53      0.656      0.216      0.404      0.215
                  boat         22         57      0.239     0.0175     0.0991     0.0542
                bottle         39         68      0.537     0.0882      0.207     0.0989
                   bus         30         40      0.455      0.425      0.402      0.264
                   car        119        271      0.496      0.668      0.627      0.387
                   cat         49         56      0.625      0.566      0.613      0.389
                 chair         85        231       0.43      0.303      0.283       0.15
                   cow         23         69      0.644      0.236      0.433      0.223
           diningtabl

In [22]:
print("Precision   :", metrics['metrics/precision(B)'])
print("Recall      :", metrics['metrics/recall(B)'])
print("mAP@0.5     :", metrics['metrics/mAP50(B)'])
print("mAP@0.5:0.95:", metrics['metrics/mAP50-95(B)'])

Precision   : 0.4674014434193749
Recall      : 0.3937320295781447
mAP@0.5     : 0.42041951219679746
mAP@0.5:0.95: 0.2411396948722522


Имплементация модели

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CBL(nn.Module):
    def __init__(self, in_channels, out_channels, k=3, s=1, p=1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, k, s, p, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.LeakyReLU(0.1, inplace=True)
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class YOLOv11(nn.Module):
    def __init__(self, num_classes=20, anchors=3):
        super().__init__()
        self.backbone = nn.Sequential(
            CBL(3, 32, 3, 1),      # 416x416 -> 416x416
            CBL(32, 64, 3, 2),     # 416x416 -> 208x208
            CBL(64, 128, 3, 2),    # 208x208 -> 104x104
            CBL(128, 256, 3, 2),   # 104x104 -> 52x52
            CBL(256, 512, 3, 2),   # 52x52 -> 26x26
            nn.MaxPool2d(5, 1, 2),
            nn.MaxPool2d(9, 1, 4),
            nn.MaxPool2d(13, 1, 6),
        )
        self.head = nn.Sequential(
            CBL(512, 256, 1, 1),
            CBL(256, 128, 1, 1),
        )
        self.pred = nn.Conv2d(128, anchors * (num_classes + 5), 1, 1, 0)

    def forward(self, x):
        x = self.backbone(x)
        x = self.head(x)
        x = self.pred(x)

        b, c, h, w = x.shape
        x = x.view(b, -1, self.num_classes+5, h, w)
        return x

    @property
    def num_classes(self):
        return self.pred.out_channels // 3 - 5


Обучим кастомную модель

In [24]:
model = YOLOv11(num_classes=20)
dummy = torch.randn(1, 3, 416, 416)
out = model(dummy)