In [1]:
!pip install torch torchvision psutil humanize gputil scikit-learn torchvision

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=eed8b9be88b2d37a889548cba133830264dc30dc7622a33cdc51dd9b692f57f4
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ifigotin/imagenetmini-1000")
path += "/imagenet-mini"

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imagenetmini-1000/imagenet-mini


In [3]:
import torch
import time
import timm
import psutil
import os
import GPUtil
import numpy as np
from torch.amp import autocast
from torchvision import datasets, transforms
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from sklearn.metrics import precision_score, recall_score, f1_score

def print_memory_usage(label=""):
    """–í—ã–≤–æ–¥–∏—Ç –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ –ø–∞–º—è—Ç–∏ –≤ –º–µ–≥–∞–±–∞–π—Ç–∞—Ö"""
    if label:
        print(f"\n--- Memory Usage ({label}) ---")
    else:
        print("\n--- Memory Usage ---")
        
    # CPU RAM –≤ MB
    process = psutil.Process(os.getpid())
    ram_used = process.memory_info().rss / (1024 ** 2)
    print(f"CPU RAM used: {ram_used:.2f} MB")
    
    # GPU VRAM –≤ MB
    gpus = GPUtil.getGPUs()
    for gpu in gpus:
        vram_used = gpu.memoryUsed
        vram_total = gpu.memoryTotal
        print(f"GPU {gpu.id} VRAM: {vram_used:.2f} MB / {vram_total:.2f} MB")

def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    return (param_size + buffer_size) / 1024**2

def calculate_metrics(model, device, data_loader):
    model.eval()
    all_preds = []
    all_targets = []

    model.to(device)
    with torch.no_grad():
        for images, targets in data_loader:
            images = images.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_targets.extend(targets.numpy())
    
    precision = precision_score(all_targets, all_preds, average='macro')
    recall = recall_score(all_targets, all_preds, average='macro')
    f1 = f1_score(all_targets, all_preds, average='macro')
    
    return precision, recall, f1

def load_imagenet_mini(dataset_path, model):
    # –°–æ–∑–¥–∞–µ–º —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º—ã –Ω–∞ –æ—Å–Ω–æ–≤–µ –º–æ–¥–µ–ª–∏
    config = resolve_data_config({}, model=model)
    transform = create_transform(**config)
    
    # –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç
    dataset = datasets.ImageFolder(
        root=os.path.join(dataset_path, 'val'),
        transform=transform
    )
    
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=64,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    
    return data_loader

def benchmark_model(model, device, input_tensor, num_runs=10, warmup=3, use_amp=False):
    model = model.to(device)
    input_tensor = input_tensor.to(device)
    
    # Warmup
    print(f"\nüî• Warming up ({warmup} runs) on {device}...")
    for _ in range(warmup):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    # Benchmark
    print(f"üöÄ Benchmarking ({num_runs} runs) on {device}...")
    start_time = time.time()
    
    for _ in range(num_runs):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    total_time = (time.time() - start_time) * 1000
    avg_time = total_time / num_runs
    print(f"‚úÖ Average inference: {avg_time:.2f} ms")
    print(f"üìä Total time: {total_time:.2f} ms | FPS: {1000/(avg_time + 1e-9):.1f}")
    
    return avg_time

def main(dataset_path=None):
    device_cpu = torch.device('cpu')
    device_gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    print("\nüîç Initial memory state:")
    print_memory_usage("Before loading model")

    print("\nüì¶ Loading EfficientViT_3b model...")
    model = timm.create_model('efficientvit_b3.r256_in1k', pretrained=True)
    model.eval()
    print(f"üìè Model size: {get_model_size(model):.2f} MB")

    print("\nüîç Initial memory state:")
    print_memory_usage("Model loaded")
    
    # –ë–µ–Ω—á–º–∞—Ä–∫–∏
    input_tensor = torch.randn(1, 3, 224, 224)
    
    print("\nüß™ Benchmarking on CPU:")
    cpu_time = benchmark_model(model, device_cpu, input_tensor)
    print_memory_usage("After CPU benchmark")
    
    if torch.cuda.is_available():
        print("\nüéÆ Benchmarking on GPU:")
        gpu_time = benchmark_model(model, device_gpu, input_tensor)
        print_memory_usage("After GPU test")
        
        print("\n‚ö° Benchmarking with AMP:")
        gpu_amp_time = benchmark_model(model, device_gpu, input_tensor, use_amp=True)
        print_memory_usage("After AMP test")
        
        print("\nüìà Results Summary:")
        print(f"| Device | Inference Time (ms) | Speedup vs CPU |")
        print("|--------|---------------------|----------------|")
        print(f"| CPU    | {cpu_time:19.2f} | {'‚Äî':^15} |")
        print(f"| GPU    | {gpu_time:19.2f} | {cpu_time/gpu_time:^15.1f}x |")
        print(f"| AMP    | {gpu_amp_time:19.2f} | {cpu_time/gpu_amp_time:^15.1f}x |")
    else:
        print("\n‚ùå CUDA not available")
        print(f"‚è±Ô∏è CPU inference time: {cpu_time:.2f} ms")
        if dataset_path:
            print("\nüéØ Quality Metrics (CPU):")
            print(f"Precision: {precision_cpu:.4f}")
            print(f"Recall:    {recall_cpu:.4f}")
            print(f"F1-Score:  {f1_cpu:.4f}")

    # –ó–∞–≥—Ä—É–∑–∫–∞ –∏ —Ä–∞—Å—á–µ—Ç –º–µ—Ç—Ä–∏–∫ –∫–∞—á–µ—Å—Ç–≤–∞
    if dataset_path:
        print("\nüìä Loading ImageNetMini dataset...")
        data_loader = load_imagenet_mini(dataset_path, model)
        
        # print("\nüßÆ Calculating metrics on CPU:")
        # precision_cpu, recall_cpu, f1_cpu = calculate_metrics(model, device_cpu, data_loader)
        
        if torch.cuda.is_available():
            print("\nüßÆ Calculating metrics on GPU:")
            precision_gpu, recall_gpu, f1_gpu = calculate_metrics(model, device_gpu, data_loader)

            print("\nüéØ Quality Metrics Summary:")
            print("| Device | Precision | Recall  | F1-Score |")
            print("|--------|-----------|---------|----------|")
            # print(f"| CPU    | {precision_cpu:.4f}  | {recall_cpu:.4f} | {f1_cpu:.4f}  |")
            print(f"| GPU    | {precision_gpu:.4f}  | {recall_gpu:.4f} | {f1_gpu:.4f}  |")

if __name__ == "__main__":
    main(dataset_path=path)


üîç Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 569.39 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

üì¶ Loading EfficientViT_3b model...


model.safetensors:   0%|          | 0.00/195M [00:00<?, ?B/s]

üìè Model size: 185.75 MB

üîç Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 764.75 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

üß™ Benchmarking on CPU:

üî• Warming up (3 runs) on cpu...
üöÄ Benchmarking (10 runs) on cpu...
‚úÖ Average inference: 113.27 ms
üìä Total time: 1132.66 ms | FPS: 8.8

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 805.37 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

üéÆ Benchmarking on GPU:

üî• Warming up (3 runs) on cuda...
üöÄ Benchmarking (10 runs) on cuda...
‚úÖ Average inference: 21.26 ms
üìä Total time: 212.57 ms | FPS: 47.0

--- Memory Usage (After GPU test) ---
CPU RAM used: 934.02 MB
GPU 0 VRAM: 539.00 MB / 16384.00 MB

‚ö° Benchmarking with AMP:

üî• Warming up (3 runs) on cuda...
üöÄ Benchmarking (10 runs) on cuda...
‚úÖ Average inference: 26.91 ms
üìä Total time: 269.12 ms | FPS: 37.2

--- Memory Usage (After AMP test) ---
CPU RAM used: 952.39 MB
GPU 0 VRAM: 615.00 MB / 16384.00 MB

üìà Results Summary:
| Dev

  _warn_prf(average, modifier, msg_start, len(result))
