In [None]:
!pip install ultralytics
!pip install -U datasets

Collecting ultralytics
  Downloading ultralytics-8.3.151-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [None]:
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import ToTensor, ToPILImage

In [None]:
from torchvision.transforms import ToPILImage

def fix_channels(t):
    """
    Convert an image tensor to a PIL image, forcing three channels where needed.

    A 2-D tensor (H, W) or a tensor whose first dimension is 1 is broadcast to
    three channels. A tensor whose first dimension is 4 keeps only its first
    three channels (presumably dropping the alpha plane of RGBA input).
    Any other tensor is handed to ToPILImage unchanged.

    :param t: Image tensor, either (H, W) or channel-first (C, H, W)
    :return: PIL image produced by torchvision's ToPILImage
    """
    to_pil = ToPILImage()
    if len(t.shape) == 2 or t.shape[0] == 1:
        # Grayscale: replicate the single plane three times (no data copy).
        rgb = t.expand(3, -1, -1)
    elif t.shape[0] == 4:
        # Four planes: keep the first three only.
        rgb = t[:3]
    else:
        rgb = t
    return to_pil(rgb)

In [None]:
import random
def xyxy_to_xcycwh(box):
    """
    Convert corner-format boxes into centre-format boxes.

    :param box: Tensor of boxes laid out as (x1, y1, x2, y2) along dim 1
    :return: Tensor of boxes laid out as (center of x, center of y, width, height) along dim 1
    """
    left, top, right, bottom = box.unbind(dim=1)
    w = right - left
    h = bottom - top
    # The centre is the top-left corner shifted by half the box size.
    return torch.stack([left + w * 0.5, top + h * 0.5, w, h], dim=1)

def cxcywh_to_xyxy(x):
    """
    Convert centre-format boxes into corner-format boxes.

    :param x: Tensor of boxes laid out as (center of x, center of y, width, height) along dim 1
    :return: Tensor of boxes laid out as (x1, y1, x2, y2) along dim 1
    """
    cx, cy, w, h = x.unbind(1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    # Corners sit half a box size either side of the centre.
    return torch.stack([cx - half_w, cy - half_h, cx + half_w, cy + half_h], dim=1)

In [None]:
from datasets import load_dataset, ReadInstruction

# Both subsets are carved out of the "train" split: the first 95% is used for
# training and the remaining 5% is held out for validation.
train_dataset, val_dataset = (
    load_dataset(
        "detection-datasets/fashionpedia",
        split=ReadInstruction("train", from_=start, to=stop, unit="%", rounding="pct1_dropremainder"),
    )
    for start, stop in ((0, 95), (95, 100))
)

README.md:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

(…)-00000-of-00007-fe108070118553c3.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

(…)-00001-of-00007-f41a5a9c38c9005b.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)-00002-of-00007-40bc8456894bcbcd.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)-00003-of-00007-9a99ff8dc572e02c.parquet:   0%|          | 0.00/490M [00:00<?, ?B/s]

(…)-00004-of-00007-f4e6f12cd2cedfea.parquet:   0%|          | 0.00/488M [00:00<?, ?B/s]

(…)-00005-of-00007-41d8dfe1edb6591e.parquet:   0%|          | 0.00/487M [00:00<?, ?B/s]

(…)-00006-of-00007-f41b0f2f4bbefac9.parquet:   0%|          | 0.00/487M [00:00<?, ?B/s]

(…)-00000-of-00001-0b29e85429788213.parquet:   0%|          | 0.00/84.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45623 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/1158 [00:00<?, ? examples/s]

In [None]:
# Category feature of the dataset's objects; `cats.names` is indexed by category id
# (see idx_to_text below in this cell).
cats = train_dataset.features['objects'].feature['category']

def idx_to_text(indexes):
    """
    Map category indexes to their label strings via `cats.names`.

    :param indexes: Iterable of category indexes
    :return: List of category labels, in the same order as `indexes`
    """
    return [cats.names[i] for i in indexes]

In [None]:
from PIL import Image, UnidentifiedImageError

def safe_save_image(image, path, size=(640, 640)):
    """
    Convert an image to RGB, resize it and write it to disk as JPEG.

    :param image: PIL image to export
    :param path: Destination file path
    :param size: Target size passed to `resize`; defaults to (640, 640)
    :return: True when the file was written, False when UnidentifiedImageError
             or OSError was raised (a message is printed in that case)
    """
    try:
        # Decode -> RGB -> bilinear resize -> JPEG, all inside the guarded region.
        image.convert('RGB').resize(size, Image.BILINEAR).save(path, 'JPEG')
    except (UnidentifiedImageError, OSError) as err:
        print(f"Corrupt image skipped: {path} ({err})")
        return False
    return True

In [None]:
def normalize_and_validate_bbox(bbox, width, height):
    """
    Convert a pixel-space (x, y, w, h) box into a normalized YOLO box.

    The box is clipped to the image rectangle at its corners, and the centre and
    size are then derived from the clipped corners. Clipping the centre and the
    size independently could leave part of the box outside the image, or keep a
    box that lies entirely outside it.

    :param bbox: Sequence (x, y, w, h): top-left corner plus width and height, in pixels
    :param width: Image width in pixels
    :param height: Image height in pixels
    :return: Tuple (x_center, y_center, w_norm, h_norm), each within [0, 1], or None
             when the image size is not positive or the clipped box has no area
    """
    # A non-positive image size cannot be normalized against (avoids ZeroDivisionError).
    if width <= 0 or height <= 0:
        return None

    x, y, w, h = bbox

    # Clip both corners to the image rectangle.
    x1 = min(max(x, 0), width)
    y1 = min(max(y, 0), height)
    x2 = min(max(x + w, 0), width)
    y2 = min(max(y + h, 0), height)

    w_norm = (x2 - x1) / width
    h_norm = (y2 - y1) / height

    # Reject degenerate boxes: zero/negative size, or nothing left after clipping.
    if w_norm <= 0 or h_norm <= 0:
        return None

    x_center = (x1 + x2) / 2 / width
    y_center = (y1 + y2) / 2 / height
    return x_center, y_center, w_norm, h_norm

In [None]:
from tqdm import tqdm
import os

def save_yolo_format(dataset, split='train', base_path='datasets/fashion_yolo', img_size=(640, 640),
                     bbox_format='xyxy'):
    """
    Export a detection dataset to the YOLO images/labels directory layout.

    Images are written to <base_path>/images/<split>/<i>.jpg and labels to
    <base_path>/labels/<split>/<i>.txt, one "<category> <xc> <yc> <w> <h>" line
    per valid box. Boxes are normalized by the ORIGINAL image size, so the labels
    remain valid after the image is resized to `img_size`.

    :param dataset: Iterable of examples with an 'image' entry (PIL image) and an
                    'objects' entry holding parallel 'bbox' and 'category' lists
    :param split: Name of the sub-directory to write ('train', 'val', ...)
    :param base_path: Root directory of the exported dataset
    :param img_size: Size every image is resized to before saving
    :param bbox_format: Layout of the dataset's boxes in pixels: 'xyxy' for corners
                        (x1, y1, x2, y2) or 'xywh' for (x, y, width, height).
                        NOTE: the detection-datasets/fashionpedia dataset card documents
                        its boxes as Pascal VOC corners, hence the 'xyxy' default.
    :raises ValueError: If `bbox_format` is neither 'xyxy' nor 'xywh'
    """
    if bbox_format not in ('xyxy', 'xywh'):
        raise ValueError(f"Unsupported bbox_format: {bbox_format!r} (expected 'xyxy' or 'xywh')")

    img_dir = os.path.join(base_path, 'images', split)
    lbl_dir = os.path.join(base_path, 'labels', split)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)

    for i, data in enumerate(tqdm(dataset, desc=f"Saving {split} set")):
        image = data['image']
        # Size of the source image (before resizing): the boxes are in these pixel units.
        width, height = image.size
        image_path = os.path.join(img_dir, f'{i}.jpg')
        label_path = os.path.join(lbl_dir, f'{i}.txt')

        # Validate and save image
        if not safe_save_image(image, image_path, size=img_size):
            continue  # Skip this image if it can't be saved

        bboxes = data['objects']['bbox']
        categories = data['objects']['category']
        valid_lines = []
        for bbox, category in zip(bboxes, categories):
            if bbox_format == 'xyxy':
                # normalize_and_validate_bbox expects (x, y, w, h): convert corners first.
                x1, y1, x2, y2 = bbox
                bbox = (x1, y1, x2 - x1, y2 - y1)
            norm = normalize_and_validate_bbox(bbox, width, height)
            if norm is not None:
                x_center, y_center, w_norm, h_norm = norm
                valid_lines.append(f"{category} {x_center:.6f} {y_center:.6f} {w_norm:.6f} {h_norm:.6f}")

        # Write label file (empty if no valid boxes)
        with open(label_path, 'w') as f:
            f.write('\n'.join(valid_lines))

In [None]:
# Export both subsets with the default base path and image size.
for _split_name, _split_data in (('train', train_dataset), ('val', val_dataset)):
    save_yolo_format(_split_data, split=_split_name)

Saving train set: 100%|██████████| 43320/43320 [11:28<00:00, 62.91it/s]
Saving val set: 100%|██████████| 2280/2280 [00:36<00:00, 61.70it/s]


In [None]:
import os

import yaml

# Category id -> label for the 46 classes referenced by the exported label files.
FASHION_CLASSES = {
    0: "shirt, blouse", 1: "top, t-shirt, sweatshirt", 2: "sweater", 3: "cardigan",
    4: "jacket", 5: "vest", 6: "pants", 7: "shorts", 8: "skirt", 9: "coat", 10: "dress",
    11: "jumpsuit", 12: "cape", 13: "glasses", 14: "hat", 15: "headband, head covering, hair accessory",
    16: "tie", 17: "glove", 18: "watch", 19: "belt", 20: "leg warmer", 21: "tights, stockings",
    22: "sock", 23: "shoe", 24: "bag, wallet", 25: "scarf", 26: "umbrella", 27: "hood",
    28: "collar", 29: "lapel", 30: "epaulette", 31: "sleeve", 32: "pocket", 33: "neckline",
    34: "buckle", 35: "zipper", 36: "applique", 37: "bead", 38: "bow", 39: "flower",
    40: "fringe", 41: "ribbon", 42: "rivet", 43: "ruffle", 44: "sequin", 45: "tassel"
}

# Prepare list of class names in order (0 to 45)
class_names = [FASHION_CLASSES[i] for i in range(len(FASHION_CLASSES))]

# NOTE(review): save_yolo_format defaults to the relative path 'datasets/fashion_yolo';
# this absolute path only matches it when the working directory is /content.
base_path = '/content/datasets/fashion_yolo'
data_yaml = {
    'train': os.path.join(base_path, 'images/train'),
    'val': os.path.join(base_path, 'images/val'),
    'nc': len(class_names),
    'names': class_names
}

# Make sure the target directory exists so the write cannot fail with FileNotFoundError.
os.makedirs(base_path, exist_ok=True)
with open(os.path.join(base_path, 'data.yaml'), 'w') as f:
    # sort_keys=False keeps the train/val/nc/names order defined above.
    yaml.dump(data_yaml, f, sort_keys=False)

In [None]:
from ultralytics import YOLO

# Start from an existing checkpoint file rather than stock pretrained weights.
# NOTE(review): presumably a YOLOv8n checkpoint from an earlier run — confirm its provenance.
model = YOLO('/content/best(1).pt')

# Fine-tune for 15 epochs at 640 px with batches of 16.
# Results go under <project>/<name>; the recorded run used 'yolov8n_fashion2',
# so the directory name can carry a numeric suffix.
train_settings = dict(
    data='/content/datasets/fashion_yolo/data.yaml',
    epochs=15,
    imgsz=640,
    batch=16,
    project='fashion_yolo_project',
    name='yolov8n_fashion',
)
model.train(**train_settings)

Ultralytics 8.3.151 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/content/datasets/fashion_yolo/data.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=15, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=/content/best(1).pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=yolov8n_fashion2, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, persp

[34m[1mtrain: [0mScanning /content/datasets/fashion_yolo/labels/train.cache... 43320 images, 0 backgrounds, 0 corrupt: 100%|██████████| 43320/43320 [00:00<?, ?it/s]

[34m[1mtrain: [0m/content/datasets/fashion_yolo/images/train/32525.jpg: 1 duplicate labels removed





[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 352.6±66.5 MB/s, size: 44.4 KB)


[34m[1mval: [0mScanning /content/datasets/fashion_yolo/labels/val.cache... 2280 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2280/2280 [00:00<?, ?it/s]


Plotting labels to fashion_yolo_project/yolov8n_fashion2/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 2 dataloader workers
Logging results to [1mfashion_yolo_project/yolov8n_fashion2[0m
Starting training for 15 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/15       3.3G      1.273      2.214      1.806        120        640: 100%|██████████| 2708/2708 [13:56<00:00,  3.24it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:24<00:00,  2.96it/s]


                   all       2280      16077      0.501       0.37      0.312      0.232

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/15      3.53G      1.159      2.067      1.703        112        640: 100%|██████████| 2708/2708 [13:26<00:00,  3.36it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:22<00:00,  3.23it/s]


                   all       2280      16077       0.46      0.341      0.283      0.206

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/15      3.53G      1.176      2.103      1.718        161        640: 100%|██████████| 2708/2708 [13:13<00:00,  3.41it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:21<00:00,  3.28it/s]


                   all       2280      16077      0.447      0.319      0.232      0.162

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/15      3.53G      1.188      2.136      1.729        137        640: 100%|██████████| 2708/2708 [12:56<00:00,  3.49it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:22<00:00,  3.21it/s]


                   all       2280      16077      0.488      0.332      0.274      0.195

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/15      3.76G      1.166      2.104       1.71        147        640: 100%|██████████| 2708/2708 [12:55<00:00,  3.49it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:22<00:00,  3.16it/s]


                   all       2280      16077      0.461      0.355      0.272      0.192
Closing dataloader mosaic
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/15      3.77G     0.9629      1.649      1.615         53        640: 100%|██████████| 2708/2708 [12:17<00:00,  3.67it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:21<00:00,  3.35it/s]


                   all       2280      16077      0.484      0.363      0.295      0.224

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/15      3.77G     0.9308      1.587      1.585         56        640: 100%|██████████| 2708/2708 [12:22<00:00,  3.65it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:21<00:00,  3.29it/s]


                   all       2280      16077      0.493      0.357        0.3      0.228

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/15      3.77G     0.9075      1.544      1.563         38        640: 100%|██████████| 2708/2708 [12:11<00:00,  3.70it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:21<00:00,  3.33it/s]


                   all       2280      16077      0.483      0.373       0.32      0.247

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/15      3.77G      0.889      1.509      1.546         35        640: 100%|██████████| 2708/2708 [12:05<00:00,  3.73it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:20<00:00,  3.45it/s]


                   all       2280      16077       0.48       0.38       0.33      0.255

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/15      3.77G     0.8764      1.481      1.533         55        640: 100%|██████████| 2708/2708 [12:10<00:00,  3.71it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:20<00:00,  3.46it/s]


                   all       2280      16077      0.476       0.39      0.333      0.261

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/15      3.77G     0.8567      1.448      1.515         59        640: 100%|██████████| 2708/2708 [12:13<00:00,  3.69it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:21<00:00,  3.32it/s]


                   all       2280      16077      0.447      0.405      0.334      0.263

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/15      3.77G     0.8394      1.418      1.499         40        640: 100%|██████████| 2708/2708 [12:16<00:00,  3.68it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:22<00:00,  3.24it/s]


                   all       2280      16077      0.477      0.405      0.351      0.276

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/15      3.77G     0.8232      1.393      1.485         41        640: 100%|██████████| 2708/2708 [12:23<00:00,  3.64it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:22<00:00,  3.19it/s]


                   all       2280      16077      0.476      0.395      0.352      0.279

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/15      3.77G     0.8095      1.367      1.472         50        640: 100%|██████████| 2708/2708 [12:16<00:00,  3.68it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:20<00:00,  3.48it/s]


                   all       2280      16077      0.489      0.394      0.356      0.283

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/15      3.77G     0.7951      1.341       1.46         67        640: 100%|██████████| 2708/2708 [12:14<00:00,  3.69it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:22<00:00,  3.25it/s]


                   all       2280      16077      0.477      0.414      0.356      0.283

15 epochs completed in 3.246 hours.
Optimizer stripped from fashion_yolo_project/yolov8n_fashion2/weights/last.pt, 6.3MB
Optimizer stripped from fashion_yolo_project/yolov8n_fashion2/weights/best.pt, 6.3MB

Validating fashion_yolo_project/yolov8n_fashion2/weights/best.pt...
Ultralytics 8.3.151 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 72 layers, 3,014,618 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 72/72 [00:26<00:00,  2.71it/s]


                   all       2280      16077      0.489      0.394      0.356      0.283
         shirt, blouse        395        397       0.53      0.713      0.673      0.433
top, t-shirt, sweatshirt        742        759      0.498      0.738       0.67      0.536
               sweater         96         98      0.435      0.429      0.374      0.314
              cardigan         64         64      0.337      0.234      0.258      0.226
                jacket        416        420      0.585      0.862      0.838      0.733
                  vest         34         34      0.423     0.0882      0.161      0.106
                 pants        652        652      0.517      0.902      0.804      0.731
                shorts         81         81      0.437      0.802      0.556      0.453
                 skirt        179        179      0.389      0.682      0.459      0.403
                  coat        143        144      0.406      0.604      0.505      0.416
                 dr

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x7e48e2d25910>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,

In [None]:
from google.colab import files
# Download the weights from the paths the trainer actually wrote to. The run folder
# can carry a numeric suffix (the recorded run used 'yolov8n_fashion2'), so hard-coding
# it breaks on re-runs. Requires the training cell to have run in this session.
files.download(str(model.trainer.best))
files.download(str(model.trainer.last))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>