In [1]:
!pip install gputil

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=5c0b33383dca4051c4256ea902c64f83be455cdcd15b4caeb57d36dc63f50b70
  Stored in directory: /root/.cache/pip/wheels/2b/4d/8f/55fb4f7b9b591891e8d3f72977c4ec6c7763b39c19f0861595
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [2]:
import os
import time

import psutil
import GPUtil
import kagglehub
import numpy as np
from tqdm import tqdm

import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

import torch
from torch import nn, optim
from torch.amp import autocast
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader
from torch.nn.utils import prune

from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
def print_memory_usage(label=""):
    """Print RAM used by this process and VRAM usage of every GPU, in megabytes.

    Args:
        label: Optional tag shown in the header line. When empty, a generic
            header is printed instead.
    """
    header = f"\n--- Memory Usage ({label}) ---" if label else "\n--- Memory Usage ---"
    print(header)

    # CPU RAM: resident set size of the current process, converted from bytes to MB.
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"CPU RAM used: {rss_bytes / (1024 ** 2):.2f} MB")

    # GPU VRAM: one line per device reported by GPUtil (values printed as-is, labelled MB).
    for device in GPUtil.getGPUs():
        print(f"GPU {device.id} VRAM: {device.memoryUsed:.2f} MB / {device.memoryTotal:.2f} MB")


def get_model_size(model):
    """Return the combined size of a model's parameters and buffers in MB (1024**2 bytes).

    Args:
        model: Object exposing ``parameters()`` and ``buffers()`` iterables of tensors.

    Returns:
        Total number of bytes occupied by all parameters and buffers, divided by 1024**2.
    """
    def _nbytes(tensors):
        # Bytes per tensor = number of elements * bytes per element.
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _nbytes(model.parameters()) + _nbytes(model.buffers())
    return total_bytes / 1024**2


def calculate_metrics(model, device, data_loader):
    """Compute macro-averaged precision, recall and F1 of a classifier over a data loader.

    The model is switched to eval mode and moved to ``device``; predictions are the
    argmax over dimension 1 of the model output.

    Args:
        model: Classification model returning per-class scores.
        device: Device the model and the image batches are moved to.
        data_loader: Iterable yielding ``(images, targets)`` batches.

    Returns:
        Tuple ``(precision, recall, f1)``, each macro-averaged.
    """
    model.eval()
    model.to(device)

    all_preds = []
    all_targets = []
    with torch.no_grad():
        for images, targets in data_loader:
            outputs = model(images.to(device))
            all_preds.extend(outputs.argmax(dim=1).cpu().tolist())
            all_targets.extend(targets.tolist())

    # zero_division=0 yields the same score as sklearn's default ('warn') for classes that
    # are never predicted / never present, but without emitting a warning for each of them.
    score_kwargs = dict(average='macro', zero_division=0)
    precision = precision_score(all_targets, all_preds, **score_kwargs)
    recall = recall_score(all_targets, all_preds, **score_kwargs)
    f1 = f1_score(all_targets, all_preds, **score_kwargs)

    return precision, recall, f1


def load_imagenet_mini(dataset_path, model, split):
    """Build an ``ImageFolder`` dataset for one split of the ImageNet-mini directory.

    The base preprocessing is obtained from the model through timm's
    ``resolve_data_config`` / ``create_transform`` and is printed. For the
    ``'train'`` split, colour jitter and a Gaussian blur are applied before it.

    Args:
        dataset_path: Directory containing one sub-directory per split.
        model: Model handed to ``resolve_data_config``.
        split: Name of the sub-directory to load; ``'train'`` also enables augmentation.

    Returns:
        ``datasets.ImageFolder`` rooted at ``<dataset_path>/<split>``.
    """
    base_transform = create_transform(**resolve_data_config({}, model=model))
    print(base_transform)

    pipeline = base_transform
    if split == 'train':
        # Augmentations run first, then the model's own preprocessing.
        augmentations = [
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
            transforms.GaussianBlur(kernel_size=3),
        ]
        pipeline = transforms.Compose([*augmentations, base_transform])

    return datasets.ImageFolder(root=os.path.join(dataset_path, split), transform=pipeline)


def benchmark_model(model, device, input_tensor, num_runs=10, warmup=3, use_amp=False):
    """Measure the average forward-pass latency of a model on a given device.

    Args:
        model: Module to benchmark; it is moved to ``device``.
        device: ``torch.device`` or device string (e.g. ``'cpu'``, ``'cuda'``).
        input_tensor: Input passed to the model on every run; it is moved to ``device``.
        num_runs: Number of timed forward passes.
        warmup: Number of untimed forward passes executed first.
        use_amp: Run under float16 autocast. Only takes effect on CUDA devices.

    Returns:
        Average time of one forward pass in milliseconds.
    """
    # Normalise so that string devices work with the `.type` checks below.
    device = torch.device(device)
    model = model.to(device)
    input_tensor = input_tensor.to(device)

    on_cuda = device.type == 'cuda'
    amp_enabled = use_amp and on_cuda

    def _forward():
        # Single inference pass, optionally under float16 autocast.
        with torch.no_grad():
            if amp_enabled:
                with autocast(device_type='cuda', dtype=torch.float16):
                    model(input_tensor)
            else:
                model(input_tensor)

    def _sync():
        # CUDA kernels are launched asynchronously; wait for them so the timer
        # measures execution rather than just kernel launch overhead.
        if on_cuda:
            torch.cuda.synchronize(device)

    # Warmup
    print(f"\n🔥 Warming up ({warmup} runs) on {device}...")
    for _ in range(warmup):
        _forward()
    _sync()

    # Benchmark
    print(f"🚀 Benchmarking ({num_runs} runs) on {device}...")
    start_time = time.perf_counter()  # monotonic, high-resolution clock

    for _ in range(num_runs):
        _forward()
    _sync()

    total_time = (time.perf_counter() - start_time) * 1000
    avg_time = total_time / num_runs
    print(f"✅ Average inference: {avg_time:.2f} ms")
    print(f"📊 Total time: {total_time:.2f} ms | FPS: {1000/(avg_time + 1e-9):.1f}")

    return avg_time


def main(model, dataset_path=None):
    """Report memory usage, latency and (optionally) quality metrics for a model.

    Steps:
      1. Print memory usage and the model size.
      2. Benchmark a single-image forward pass on CPU and, when CUDA is available,
         on GPU with and without AMP, then print a summary table.
      3. If ``dataset_path`` is given, evaluate macro precision / recall / F1 on the
         ``'val'`` split, on GPU when available and otherwise on CPU.

    Args:
        model: Model to analyse; it is put into eval mode.
        dataset_path: Root directory of the dataset (must contain a ``val`` folder).
            When falsy, the quality evaluation is skipped.
    """
    device_cpu = torch.device('cpu')
    has_cuda = torch.cuda.is_available()
    device_gpu = torch.device('cuda' if has_cuda else 'cpu')

    print("\n🔍 Initial memory state:")
    print_memory_usage("Before loading model")
    model.eval()
    print(f"📏 Model size: {get_model_size(model):.2f} MB")

    print("\n🔍 Initial memory state:")
    print_memory_usage("Model loaded")

    # Benchmarks
    # NOTE(review): the benchmark input is fixed at 224x224 regardless of the model's
    # configured input size -- confirm this is intended.
    input_tensor = torch.randn(1, 3, 224, 224)

    print("\n🧪 Benchmarking on CPU:")
    cpu_time = benchmark_model(model, device_cpu, input_tensor)
    print_memory_usage("After CPU benchmark")

    if has_cuda:
        print("\n🎮 Benchmarking on GPU:")
        gpu_time = benchmark_model(model, device_gpu, input_tensor)
        print_memory_usage("After GPU test")

        print("\n⚡ Benchmarking with AMP:")
        gpu_amp_time = benchmark_model(model, device_gpu, input_tensor, use_amp=True)
        print_memory_usage("After AMP test")

        print("\n📈 Results Summary:")
        print("| Device | Inference Time (ms) | Speedup vs CPU |")
        print("|--------|---------------------|----------------|")
        print(f"| CPU    | {cpu_time:19.2f} | {'—':^15} |")
        print(f"| GPU    | {gpu_time:19.2f} | {cpu_time/gpu_time:^15.1f}x |")
        print(f"| AMP    | {gpu_amp_time:19.2f} | {cpu_time/gpu_amp_time:^15.1f}x |")
    else:
        # Quality metrics are not printed here: they are computed (on CPU) in the
        # evaluation section below, once the dataset has actually been loaded.
        print("\n❌ CUDA not available")
        print(f"⏱️ CPU inference time: {cpu_time:.2f} ms")

    # Load the dataset and compute the quality metrics
    if dataset_path:
        print("\n📊 Loading ImageNetMini dataset...")
        val_dataset = load_imagenet_mini(dataset_path, model, 'val')
        # Pinned host memory only helps (and is only supported without a warning) with CUDA.
        data_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, pin_memory=has_cuda, drop_last=False)

        # Evaluate once, on the fastest available device.
        if has_cuda:
            metrics_device, device_label = device_gpu, "GPU"
        else:
            metrics_device, device_label = device_cpu, "CPU"

        print(f"\n🧮 Calculating metrics on {device_label}:")
        precision, recall, f1 = calculate_metrics(model, metrics_device, data_loader)

        print("\n🎯 Quality Metrics Summary:")
        print("| Device | Precision | Recall  | F1-Score |")
        print("|--------|-----------|---------|----------|")
        print(f"| {device_label}    | {precision:.4f}  | {recall:.4f} | {f1:.4f}  |")

In [4]:
# Resolve the dataset location through kagglehub and point `path` at its `imagenet-mini` folder.
# os.path.join adds the path separator only when it is missing, unlike plain string concatenation.
path = os.path.join(kagglehub.dataset_download("ifigotin/imagenetmini-1000"), "imagenet-mini")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imagenet-mini


In [5]:
# Instantiate the timm model 'efficientvit_b3.r256_in1k' with pretrained weights;
# this instance is the one used by the unstructured-pruning experiment.
base_model = timm.create_model('efficientvit_b3.r256_in1k', pretrained=True)

model.safetensors:   0%|          | 0.00/195M [00:00<?, ?B/s]

In [6]:
# Baseline: benchmark and evaluate the model before any pruning is applied.
main(base_model, path)


🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 940.84 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB
📏 Model size: 185.75 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 940.97 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 123.89 ms
📊 Total time: 1238.88 ms | FPS: 8.1

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 985.72 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 21.78 ms
📊 Total time: 217.81 ms | FPS: 45.9

--- Memory Usage (After GPU test) ---
CPU RAM used: 1102.50 MB
GPU 0 VRAM: 539.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 26.33 ms
📊 Total time: 263.29 ms | FPS: 38.0

--- Memory Usage (After AMP test) ---
CPU RA

  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Fixed seed so the train/validation partition is identical on every run.
SPLIT_SEED = 42

original_dataset = load_imagenet_mini(path, base_model, 'train')

# NOTE(review): both subsets wrap `original_dataset`, so the validation subset goes through
# the same transform as the training subset (including the train-time augmentation).
train_dataset, val_dataset = random_split(
    original_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
test_dataset = load_imagenet_mini(path, base_model, 'val')

Compose(
    Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(256, 256))
    MaybeToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)
Compose(
    Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(256, 256))
    MaybeToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)


In [8]:
batch_size = 32

# Settings shared by all three loaders; only the training loader shuffles its samples.
_common_loader_kwargs = dict(batch_size=batch_size, num_workers=4, pin_memory=True)

train_loader = DataLoader(train_dataset, shuffle=True, **_common_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_common_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_common_loader_kwargs)

In [9]:
def train_one_epoch(
    model: nn.Module,
    train_loader: DataLoader,
    optimizer: optim.Optimizer,
    criterion: nn.Module,
    device: str,
):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; it is switched to training mode.
        train_loader: Loader yielding ``(images, targets)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to ``(model output, targets)``.
        device: Device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()

    running_loss = 0
    progress = tqdm(train_loader, desc='Training model')
    for inputs, labels in progress:
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear gradients from the previous step, then forward / backward / update.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(train_loader)


def validate_model(
    model: nn.Module,
    val_loader: DataLoader,
    criterion: nn.Module,
    device: str,
):
    """Evaluate the model on ``val_loader`` without computing gradients.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        val_loader: Loader yielding ``(images, targets)`` batches.
        criterion: Loss applied to ``(model output, targets)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean batch loss, macro-averaged F1 score)``.
    """
    model.eval()

    predicted_labels = []
    true_labels = []
    total_loss = 0

    for inputs, labels in tqdm(val_loader, desc='Validating model'):
        inputs, labels = inputs.to(device), labels.to(device)

        with torch.no_grad():
            logits = model(inputs)
            total_loss += criterion(logits, labels).item()

        # Predicted class = index of the highest score along dimension 1.
        predicted_labels.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy().tolist())

    mean_loss = total_loss / len(val_loader)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    return mean_loss, macro_f1

## Unstructured pruning

In [10]:
# Fine-tuning hyper-parameters for the unstructured-pruning experiment.
epoch_num = 1
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

optimizer = optim.Adam(base_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
best_score = 0

# Pruning targets: the `weight` tensor of every Conv2d layer in the model.
params_to_prune = [
    (layer, 'weight')
    for layer in base_model.modules()
    if isinstance(layer, nn.Conv2d)
]

# Prune amount=0.2 of the listed weights, selected globally (across all listed tensors)
# with the L1Unstructured criterion.
prune.global_unstructured(
    params_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2,
)

# Trailing semicolon suppresses the module repr in the notebook output.
base_model.to(device);

In [11]:
# Benchmark and evaluate the model right after unstructured pruning, before fine-tuning.
print('BEFORE FINETUNING')
main(base_model, path)

BEFORE FINETUNING

🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 1351.16 MB
GPU 0 VRAM: 2231.00 MB / 16384.00 MB
📏 Model size: 338.65 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 1351.41 MB
GPU 0 VRAM: 2231.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 147.70 ms
📊 Total time: 1476.99 ms | FPS: 6.8

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 1889.28 MB
GPU 0 VRAM: 2231.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 28.54 ms
📊 Total time: 285.42 ms | FPS: 35.0

--- Memory Usage (After GPU test) ---
CPU RAM used: 1806.46 MB
GPU 0 VRAM: 2241.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 32.77 ms
📊 Total time: 327.66 ms | FPS: 30.5

--- Memory Us

  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Fine-tune for `epoch_num` epochs, printing train loss, validation loss and F1 after each one.
for epoch_ind in range(1, epoch_num + 1):
    train_loss = train_one_epoch(base_model, train_loader, optimizer, criterion, device)
    val_loss, f1_score_val = validate_model(base_model, val_loader, criterion, device)

    print(f'Epoch #{epoch_ind} Train Loss: {round(train_loss, 5)} Val Loss: {round(val_loss, 5)} F1: {round(f1_score_val, 5)}')

    # Best-model checkpointing is disabled. NOTE(review): the snippet below refers to
    # `student_model`, which is not defined anywhere in the visible notebook.
    # if f1_score_val > best_score:
    #     best_score = f1_score_val
    #     torch.save(student_model.state_dict(), 'best_model.pt')

# Remove the pruning re-parametrization from every module that was pruned.
for module, param_name in params_to_prune:
    prune.remove(module, param_name)

Training model: 100%|██████████| 869/869 [11:47<00:00,  1.23it/s]
Validating model: 100%|██████████| 218/218 [02:43<00:00,  1.34it/s]

Epoch #1 Train Loss: 0.40074 Val Loss: 0.43607 F1: 0.87727





In [13]:
# Benchmark and evaluate the unstructured-pruned model again after fine-tuning.
print('AFTER FINETUNING')
main(base_model, path)

AFTER FINETUNING

🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 2142.76 MB
GPU 0 VRAM: 11919.00 MB / 16384.00 MB
📏 Model size: 185.75 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 2142.76 MB
GPU 0 VRAM: 11919.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 113.22 ms
📊 Total time: 1132.17 ms | FPS: 8.8

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 2161.13 MB
GPU 0 VRAM: 11919.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 22.18 ms
📊 Total time: 221.81 ms | FPS: 45.1

--- Memory Usage (After GPU test) ---
CPU RAM used: 2161.13 MB
GPU 0 VRAM: 11919.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 27.11 ms
📊 Total time: 271.05 ms | FPS: 36.9

--- Memory

  _warn_prf(average, modifier, msg_start, len(result))


## Structured pruning

Чтобы действительно ускорить модель и снизить потребление памяти, необходимо использовать структурный прунинг. В PyTorch нет механизма удаления весов и обновления архитектуры после такого прунинга, поэтому решено использовать пакет [`Torch-Pruning`](https://github.com/VainF/Torch-Pruning).

In [14]:
!pip install torch-pruning

Collecting torch-pruning
  Downloading torch_pruning-1.5.2-py3-none-any.whl.metadata (31 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->torch-pruning)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->torch-pruning)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->torch-pruning)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->torch-pruning)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch->torch-pruning)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch->torch-pruning)
  Downlo

In [15]:
import torch_pruning as tp

In [16]:
# Ask PyTorch to release cached CUDA memory before the next experiment.
# NOTE(review): `base_model` from the previous experiment is still referenced at this point,
# so its own tensors are presumably not freed by this call -- confirm.
torch.cuda.empty_cache()

In [17]:
# Start the structured-pruning experiment from a fresh pretrained model.
base_model = timm.create_model('efficientvit_b3.r256_in1k', pretrained=True)
# Dummy 256x256 input, passed to the pruner and to the MACs/parameter counter below.
example_inputs = torch.randn(1, 3, 256, 256)
# Importance criterion handed to the pruner (group magnitude with p=2).
imp = tp.importance.GroupMagnitudeImportance(p=2)

In [18]:
# Ignoring output linear layer
ignored_layers = []
for m in base_model.modules():
    if isinstance(m, torch.nn.Linear) and m.out_features == 1000:
        ignored_layers.append(m)

pruner = tp.pruner.BasePruner(
    base_model,
    example_inputs,
    importance=imp,
    pruning_ratio=0.2,
    ignored_layers=ignored_layers,
    isomorphic=True,
    global_pruning=True,
    round_to=4,
)

In [19]:
# Measure MACs and parameter count before pruning so the reduction can be reported afterwards.
base_macs, base_nparams = tp.utils.count_ops_and_params(base_model, example_inputs)
tp.utils.print_tool.before_pruning(base_model) # or print(model)
# Execute the pruning step configured on `pruner`.
pruner.step()
tp.utils.print_tool.after_pruning(base_model) # or print(model), this util will show the difference before and after pruning
# Same measurement on the model after pruning.
macs, nparams = tp.utils.count_ops_and_params(base_model, example_inputs)

EfficientVit(
  (stem): Stem(
    (in_conv): ConvNormAct(
      (dropout): Dropout(p=0.0, inplace=False)
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) => (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) => (norm): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): Hardswish()
    )
    (res0): ResidualBlock(
      (pre_norm): Identity()
      (main): DSConv(
        (depth_conv): ConvNormAct(
          (dropout): Dropout(p=0.0, inplace=False)
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False) => (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=24, bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) => (norm): BatchNorm2d(24, eps=

In [20]:
# Report the before -> after change in MACs (scaled by 1e9) and parameter count (scaled by 1e6).
print(f"MACs: {base_macs/1e9} G -> {macs/1e9} G, #Params: {base_nparams/1e6} M -> {nparams/1e6} M")

MACs: 5.121140712 G -> 3.23978212 G, #Params: 48.646056 M -> 31.040108 M


In [21]:
# Benchmark and evaluate the model right after structured pruning, before fine-tuning.
print('BEFORE FINETUNING')
main(base_model, path)

BEFORE FINETUNING

🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 2539.85 MB
GPU 0 VRAM: 1721.00 MB / 16384.00 MB
📏 Model size: 118.55 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 2539.85 MB
GPU 0 VRAM: 1721.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 112.30 ms
📊 Total time: 1123.01 ms | FPS: 8.9

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 2555.35 MB
GPU 0 VRAM: 1721.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 22.94 ms
📊 Total time: 229.41 ms | FPS: 43.6

--- Memory Usage (After GPU test) ---
CPU RAM used: 2555.35 MB
GPU 0 VRAM: 1747.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 29.44 ms
📊 Total time: 294.37 ms | FPS: 34.0

--- Memory Us

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Fine-tuning setup for the structurally pruned model (5 epochs here vs. 1 in the unstructured run).
epoch_num = 5
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# New optimizer over the pruned model's parameters.
optimizer = optim.Adam(base_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
best_score = 0

# Trailing semicolon suppresses the module repr in the notebook output.
base_model.to(device);

In [23]:
# Fine-tune for `epoch_num` epochs, printing train loss, validation loss and F1 after each one.
for epoch_ind in range(1, epoch_num + 1):
    train_loss = train_one_epoch(base_model, train_loader, optimizer, criterion, device)
    val_loss, f1_score_val = validate_model(base_model, val_loader, criterion, device)

    print(f'Epoch #{epoch_ind} Train Loss: {round(train_loss, 5)} Val Loss: {round(val_loss, 5)} F1: {round(f1_score_val, 5)}')

    # Best-model checkpointing is disabled. NOTE(review): the snippet below refers to
    # `student_model`, which is not defined anywhere in the visible notebook.
    # if f1_score_val > best_score:
    #     best_score = f1_score_val
    #     torch.save(student_model.state_dict(), 'best_model.pt')

Training model: 100%|██████████| 869/869 [11:38<00:00,  1.24it/s]
Validating model: 100%|██████████| 218/218 [02:42<00:00,  1.34it/s]


Epoch #1 Train Loss: 2.25801 Val Loss: 1.70385 F1: 0.58277


Training model: 100%|██████████| 869/869 [11:34<00:00,  1.25it/s]
Validating model: 100%|██████████| 218/218 [02:38<00:00,  1.37it/s]


Epoch #2 Train Loss: 1.06818 Val Loss: 1.4825 F1: 0.62469


Training model: 100%|██████████| 869/869 [11:29<00:00,  1.26it/s]
Validating model: 100%|██████████| 218/218 [02:39<00:00,  1.37it/s]


Epoch #3 Train Loss: 0.62608 Val Loss: 1.3566 F1: 0.66407


Training model: 100%|██████████| 869/869 [11:31<00:00,  1.26it/s]
Validating model: 100%|██████████| 218/218 [02:38<00:00,  1.38it/s]


Epoch #4 Train Loss: 0.25457 Val Loss: 1.36285 F1: 0.66252


Training model: 100%|██████████| 869/869 [11:25<00:00,  1.27it/s]
Validating model: 100%|██████████| 218/218 [02:39<00:00,  1.37it/s]

Epoch #5 Train Loss: 0.13202 Val Loss: 1.49056 F1: 0.65153





In [24]:
# Benchmark and evaluate the structurally pruned model again after fine-tuning.
print('AFTER FINETUNING')
main(base_model, path)

AFTER FINETUNING

🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 2571.00 MB
GPU 0 VRAM: 9169.00 MB / 16384.00 MB
📏 Model size: 118.55 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 2571.00 MB
GPU 0 VRAM: 9169.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 110.98 ms
📊 Total time: 1109.82 ms | FPS: 9.0

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 2597.00 MB
GPU 0 VRAM: 9169.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 22.66 ms
📊 Total time: 226.63 ms | FPS: 44.1

--- Memory Usage (After GPU test) ---
CPU RAM used: 2597.00 MB
GPU 0 VRAM: 9169.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 28.42 ms
📊 Total time: 284.25 ms | FPS: 35.2

--- Memory Usa

  _warn_prf(average, modifier, msg_start, len(result))


**Вывод:**
Данный метод действительно позволяет сократить потребляемую память и ускоряет работу модели на процессоре, однако на GPU существенного прироста нет. Кроме того, приходится дольше файнтьюнить модель, чтобы она вернула себе какую-то точность.