In [1]:
!pip install GPUtil



In [2]:
import os
import time
import torch
import psutil
import GPUtil
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, recall_score, f1_score
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from torchvision import datasets, transforms

In [3]:
import kagglehub
path = kagglehub.dataset_download("ifigotin/imagenetmini-1000")
path += "/imagenet-mini"

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imagenetmini-1000/imagenet-mini


In [5]:
def load_imagenet_mini(dataset_path, model):
    # Создаем трансформы на основе модели
    config = resolve_data_config({}, model=model)
    transform = create_transform(**config)
    
    # Загружаем датасет
    dataset = datasets.ImageFolder(
        root=os.path.join(dataset_path, 'val'),
        transform=transform
    )
    
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=64,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    
    return data_loader

In [6]:
def print_memory_usage():
    proc = psutil.Process(os.getpid())
    ram = proc.memory_info().rss / 1024**2
    gpus = GPUtil.getGPUs()
    vram = sum(g.memoryUsed for g in gpus)
    return ram, vram

In [7]:
def benchmark_model(model, device, input_tensor, num_runs=10, warmup=3, use_amp=False):
    model = model.to(device)
    input_tensor = input_tensor.to(device)
    
    # Warmup
    print(f"\n🔥 Warming up ({warmup} runs) on {device}...")
    for _ in range(warmup):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    # Benchmark
    print(f"🚀 Benchmarking ({num_runs} runs) on {device}...")
    start_time = time.time()
    
    for _ in range(num_runs):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    total_time = (time.time() - start_time) * 1000
    avg_time = total_time / num_runs
    print(f"✅ Average inference: {avg_time:.2f} ms")
    print(f"📊 Total time: {total_time:.2f} ms | FPS: {1000/(avg_time + 1e-9):.1f}")
    
    return avg_time

def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    return (param_size + buffer_size) / 1024**2

def calculate_metrics(model, device, data_loader):
    model.eval()
    all_preds = []
    all_targets = []

    model.to(device)
    with torch.no_grad():
        for images, targets in data_loader:
            images = images.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_targets.extend(targets.numpy())
    
    precision = precision_score(all_targets, all_preds, average='macro')
    recall = recall_score(all_targets, all_preds, average='macro')
    f1 = f1_score(all_targets, all_preds, average='macro')
    
    return precision, recall, f1

In [8]:
input_tensor = torch.randn(1, 3, 224, 224)

In [9]:
model = timm.create_model('efficientvit_b3.r256_in1k', pretrained=True).eval()
loader = load_imagenet_mini(dataset_path=path, model=model)
baseline_size  = get_model_size(model)
baseline_cpu   = benchmark_model(model, torch.device('cpu'),  input_tensor)
baseline_gpu   = benchmark_model(model, torch.device('cuda'), input_tensor)
baseline_prec, baseline_rec, baseline_f1 = calculate_metrics(model, torch.device('cuda'), loader)


🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 121.76 ms
📊 Total time: 1217.61 ms | FPS: 8.2

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 22.19 ms
📊 Total time: 221.87 ms | FPS: 45.1


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

def cluster_model_weights(model, k=32, skip_prefix=('stem', 'head.')):
    """Заменяет веса на центроиды, возвращает оценку сжатого размера."""
    compressed_bytes = 0
    model.cpu()

    with torch.no_grad():
        for name, p in tqdm(model.named_parameters(), desc="Clustering"):
            if p.dim() < 2 or name.startswith(skip_prefix):
                continue                            # пропускаем bias, stem, head

            flat = p.detach().reshape(-1, 1).float().numpy()
            k_here = min(k, len(flat))
            kmeans = MiniBatchKMeans(k_here, batch_size=4096,
                                     max_iter=100, n_init='auto',
                                     random_state=0).fit(flat)
            
            centers = torch.tensor(kmeans.cluster_centers_.squeeze(),
                                   dtype=p.dtype)
            labels  = torch.from_numpy(kmeans.labels_).long()
            
            p.copy_(centers[labels].reshape_as(p))

            # считаем объём «индексы + центроиды»
            compressed_bytes += labels.numel()            # uint8 → 1 байт
            compressed_bytes += centers.numel() * 4       # float32 → 4 байта

    return compressed_bytes / 1_048_576                   # → МБ

In [17]:
compressed_size = cluster_model_weights(model, k=32)

cluster_cpu  = benchmark_model(model, torch.device('cpu'),  input_tensor)
cluster_gpu  = benchmark_model(model, torch.device('cuda'), input_tensor)

print(f"Сжатый размер: {compressed_size:.2f} MB  "
      f"(было {baseline_size:.2f} MB)")
print(f"CPU {baseline_cpu:.1f} → {cluster_cpu:.1f} ms   "
      f"GPU {baseline_gpu:.1f} → {cluster_gpu:.1f} ms")

Clustering: 316it [00:06, 46.33it/s]



🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 116.38 ms
📊 Total time: 1163.82 ms | FPS: 8.6

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 22.30 ms
📊 Total time: 223.03 ms | FPS: 44.8
Сжатый размер: 37.11 MB  (было 185.75 MB)
CPU 121.8 → 106.4 ms   GPU 22.2 → 21.8 ms
