In [None]:
!pip install -q ultralytics

## YOLO compression

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
from torchvision import models
from torch.nn.utils import prune
from torch.utils.data import DataLoader
from ultralytics import YOLO

import time
from tqdm.notebook import tqdm

In [None]:
# Baseline: load the YOLOv8l checkpoint that the compression experiments start from.
checkpoint_name = 'yolov8l.pt'
model = YOLO(checkpoint_name)

In [None]:
# Footprint of the model's tensors: bytes held by its parameters plus bytes
# held by its registered buffers.
param_size = sum(p.numel() * p.element_size() for p in model.parameters())
buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())

# Bytes -> mebibytes (1024**2 bytes); the printed label reads "MB".
size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 166.848MB


In [None]:
# Put the in-memory model into evaluation mode.
model.eval()
# Validate the yolov8l.pt checkpoint through the Ultralytics command line; the
# run is named "yolov8l_eval". The command runs in a shell and is given the
# checkpoint path, not the `model` object above. No data= argument is passed;
# the recorded run scanned the coco8 validation labels (4 images).
!yolo task=detect mode=val model=yolov8l.pt name=yolov8l_eval

Ultralytics 8.3.97 🚀 Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
YOLOv8l summary (fused): 112 layers, 43,668,288 parameters, 0 gradients, 165.2 GFLOPs
[34m[1mval: [0mScanning /content/datasets/coco8/labels/val.cache... 4 images, 0 backgrounds, 0 corrupt: 100% 4/4 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% 1/1 [00:00<00:00,  1.46it/s]
                   all          4         17      0.914      0.839      0.976       0.76
                person          3         10          1       0.42       0.88      0.574
                   dog          1          1      0.869          1      0.995      0.895
                 horse          1          2      0.902          1      0.995      0.681
              elephant          1          2          1      0.612      0.995      0.619
              umbrella          1          1      0.822          1      0.995      0.995
          potted plant          1        

In [None]:
# Latency benchmark: mean wall-clock time of one single-image call to `model`.
inp = torch.randn(1, 3, 224, 224)
# Scale once, outside the timed loop, so the division is not billed to inference.
# NOTE(review): Gaussian noise / 255 is not image-like (values sit close to 0
# and can be negative); latency should not depend on it, but confirm.
scaled_inp = inp / 255

num_samples = 1000
num_warmup = 10

# Warm-up calls: one-off set-up work done on the first calls would otherwise
# be counted in the average.
for _ in range(num_warmup):
    model(scaled_inp, verbose=False)

# CUDA work is queued asynchronously; synchronise before reading the clock so
# the timed interval brackets finished work only.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()  # monotonic, high-resolution timer
for _ in tqdm(range(num_samples)):
    output = model(scaled_inp, verbose=False)
if torch.cuda.is_available():
    torch.cuda.synchronize()
end_time = time.perf_counter()

# Seconds per call -> milliseconds per call.
infer_time = ((end_time - start_time) / num_samples) * 1000
print("---"*10, f'Avg inference time: {infer_time:.4f} ms', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------
Avg inference time: 13.7541 ms


### Квантизация

In [None]:
# Re-create `model` from the YOLOv8l checkpoint as the input of the quantization step.
checkpoint_name = 'yolov8l.pt'
model = YOLO(checkpoint_name)

In [None]:
# Post-training dynamic quantization of `model`; with the default
# inplace=False the call works on a deep copy and leaves `model` untouched.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Conv2d},
    dtype=torch.qint8
)

# Dynamic quantization only swaps module types that have a dynamic int8
# implementation (nn.Linear and the recurrent layers); nn.Conv2d is not one of
# them, so the call above can hand back an unchanged float32 copy without
# raising. The recorded size of quantized_model (166.848MB) equals the
# baseline, which is consistent with that. Compare module types pairwise so
# that such a silent no-op is reported instead of being mistaken for a
# compressed model.
num_swapped = sum(
    type(q_mod) is not type(f_mod)
    for q_mod, f_mod in zip(quantized_model.modules(), model.modules())
)
if num_swapped == 0:
    print('WARNING: quantize_dynamic replaced no modules; '
          'quantized_model is still a float32 copy of model')

# Persist the tensors of the (possibly unchanged) copy.
torch.save(quantized_model.state_dict(), 'quantized.pt')

In [None]:
# Footprint of quantized_model's tensors: bytes held by its parameters plus
# bytes held by its registered buffers.
# NOTE(review): modules that were really swapped for quantized versions may
# keep their packed weights outside parameters()/buffers(), in which case they
# would not be counted here - confirm before relying on this number.
param_size = sum(p.numel() * p.element_size() for p in quantized_model.parameters())
buffer_size = sum(b.numel() * b.element_size() for b in quantized_model.buffers())

# Bytes -> mebibytes (1024**2 bytes); the printed label reads "MB".
size_all_mb = (param_size + buffer_size) / 1024**2
print('quantized_model size: {:.3f}MB'.format(size_all_mb))

quantized_model size: 166.848MB


In [None]:
# Latency benchmark: mean wall-clock time of one single-image call to
# `quantized_model`.
inp = torch.randn(1, 3, 224, 224)
# Scale once, outside the timed loop, so the division is not billed to inference.
# NOTE(review): Gaussian noise / 255 is not image-like (values sit close to 0
# and can be negative); latency should not depend on it, but confirm.
scaled_inp = inp / 255

num_samples = 1000
num_warmup = 10

# Warm-up calls: one-off set-up work done on the first calls would otherwise
# be counted in the average.
for _ in range(num_warmup):
    quantized_model(scaled_inp, verbose=False)

# CUDA work is queued asynchronously; synchronise before reading the clock so
# the timed interval brackets finished work only.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()  # monotonic, high-resolution timer
for _ in tqdm(range(num_samples)):
    output = quantized_model(scaled_inp, verbose=False)
if torch.cuda.is_available():
    torch.cuda.synchronize()
end_time = time.perf_counter()

# Seconds per call -> milliseconds per call.
infer_time = ((end_time - start_time) / num_samples) * 1000
print("---"*10, f'Avg inference time: {infer_time:.4f} ms', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------
Avg inference time: 13.3719 ms


### Прунинг

In [None]:
# Re-create `model` from the YOLOv8l checkpoint; the pruning step modifies it in place.
checkpoint_name = 'yolov8l.pt'
model = YOLO(checkpoint_name)

In [None]:
# Unstructured magnitude pruning: zero the 30% of entries with the smallest
# absolute value in every module that owns a parameter called `weight`.
# NOTE(review): this condition also matches normalisation layers (their scale
# vector is named `weight`), not only convolutions - confirm that is intended.
for m in model.modules():
    # recurse=False restricts the lookup to parameters registered on `m`
    # itself, so container modules are skipped without listing the parameters
    # of all their descendants.
    if 'weight' not in dict(m.named_parameters(recurse=False)):
        continue
    prune.l1_unstructured(m, name='weight', amount=0.3)
    # Make the pruning permanent. While the re-parametrisation is active a
    # pruned module stores `weight_orig` (parameter) plus `weight_mask`
    # (buffer) and exposes `weight` only as a derived plain attribute. That
    # roughly doubles the size computed from parameters + buffers (the
    # recorded 333.428MB) and appears to be what triggers the recorded
    # cross-device RuntimeError at inference - TODO confirm. prune.remove
    # folds the mask into `weight` and registers it as the layer's sole
    # weight tensor again.
    prune.remove(m, 'weight')

In [None]:
# Footprint of the pruned model's tensors: bytes held by its parameters plus
# bytes held by its registered buffers.
param_size = sum(p.numel() * p.element_size() for p in model.parameters())
buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())

# Bytes -> mebibytes (1024**2 bytes); the printed label reads "MB".
size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 333.428MB


In [None]:
# Latency benchmark: mean wall-clock time of one single-image call to the
# pruned `model`.
# NOTE(review): the recorded run of this cell failed with a cross-device
# RuntimeError; presumably the pruning re-parametrisation (weight_orig /
# weight_mask) still attached to `model` is responsible - verify that pruning
# has been made permanent with prune.remove before benchmarking.
inp = torch.randn(1, 3, 224, 224)
# Scale once, outside the timed loop, so the division is not billed to inference.
# NOTE(review): Gaussian noise / 255 is not image-like (values sit close to 0
# and can be negative); latency should not depend on it, but confirm.
scaled_inp = inp / 255

num_samples = 1000
num_warmup = 10

# Warm-up calls: one-off set-up work done on the first calls would otherwise
# be counted in the average.
for _ in range(num_warmup):
    model(scaled_inp, verbose=False)

# CUDA work is queued asynchronously; synchronise before reading the clock so
# the timed interval brackets finished work only.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()  # monotonic, high-resolution timer
for _ in tqdm(range(num_samples)):
    output = model(scaled_inp, verbose=False)
if torch.cuda.is_available():
    torch.cuda.synchronize()
end_time = time.perf_counter()

# Seconds per call -> milliseconds per call.
infer_time = ((end_time - start_time) / num_samples) * 1000
print("---"*10, f'Avg inference time: {infer_time:.4f} ms', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!