In [1]:
# Install the ONNX toolchain used by the cells below (quiet mode).
# NOTE(review): versions are unpinned, so a re-run may resolve to different
# releases; `onnxruntime-tools` is not imported by any visible cell — confirm
# it is actually required.
!pip install -q \
    onnx \
    onnxruntime \
    onnxruntime-tools \
    tqdm


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import shutil
import os

# Source model on the Kaggle input mount and the working-directory copy.
FP32_MODEL_SRC = "/kaggle/input/inception-v3/onnx/default/1/inception_v3.onnx"
FP32_MODEL_LOCAL = "/kaggle/working/inception_v3.onnx"

# The copy is skipped when the target already exists, so report what actually
# happened instead of always claiming a copy took place.
if os.path.exists(FP32_MODEL_LOCAL):
    print("Model already present at writable path:", FP32_MODEL_LOCAL)
else:
    shutil.copy(FP32_MODEL_SRC, FP32_MODEL_LOCAL)
    print("Copied model to writable path:", FP32_MODEL_LOCAL)


Copied model to writable path: /kaggle/working/inception_v3.onnx


In [11]:
import onnxruntime as ort
from onnxruntime.quantization import quantize_static, QuantType, CalibrationDataReader
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Subset
import numpy as np
import time
import os
import shutil
from tqdm import tqdm

# -----------------------------------------------------
# Kaggle locations used by this cell
# -----------------------------------------------------
# Root folder handed to ImageFolder further down.
IMAGENET_PATH = "/kaggle/input/imagenet1kvalid"

# FP32 model as shipped on the read-only input mount.
ORIGINAL_FP32_PATH = "/kaggle/input/inception-v3/onnx/default/1/inception_v3.onnx"

# Writable locations: the FP32 copy and the quantized model produced from it.
FP32_MODEL_PATH = "/kaggle/working/inception_v3_fp32.onnx"
INT8_MODEL_PATH = "/kaggle/working/inception_v3_int8.onnx"

# Informational only: a missing source still surfaces as an error from the
# copy below.
model_found = os.path.isfile(ORIGINAL_FP32_PATH)
imagenet_found = os.path.isdir(IMAGENET_PATH)
print("Original FP32 model exists:", model_found)
print("ImageNet exists:", imagenet_found)

print("\nCopying FP32 model to writable directory...")
shutil.copy(src=ORIGINAL_FP32_PATH, dst=FP32_MODEL_PATH)
print("✓ Model copied to:", FP32_MODEL_PATH)

# CALIBRATION DATA READER

class InceptionCalibrationDataReader(CalibrationDataReader):
    """Feed calibration batches from a dataset to ``quantize_static``.

    Batches are drawn in order (no shuffling) from a ``DataLoader`` built over
    ``calibration_dataset``; labels are discarded.

    Args:
        calibration_dataset: Dataset yielding ``(image, label)`` pairs whose
            images collate into a tensor exposing ``.numpy()``.
        batch_size: Number of images per calibration batch.
        input_name: Name of the model input the images are bound to. Defaults
            to ``"input"``; pass the real name when the model uses another one.
    """

    def __init__(self, calibration_dataset, batch_size=16, input_name="input"):
        self.loader = DataLoader(
            calibration_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2
        )
        # A single pass: once this iterator is exhausted get_next keeps
        # returning None.
        self.iterator = iter(self.loader)
        self.input_name = input_name

    def get_next(self):
        """Return the next ``{input_name: ndarray}`` feed, or ``None`` when done."""
        batch = next(self.iterator, None)
        if batch is None:
            return None
        images, _ = batch
        return {self.input_name: images.numpy()}


# PREPROCESSING

print("\n[1] Preparing preprocessing...")

# Per-channel statistics applied after conversion to a tensor.
# NOTE(review): these look like the usual ImageNet mean/std — confirm they
# match the preprocessing the ONNX model was exported with.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Resize, then take a 299x299 centre crop before tensor conversion and
# normalisation.
preprocess_steps = [
    transforms.Resize(342),
    transforms.CenterCrop(299),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
preprocess = transforms.Compose(preprocess_steps)

print("✓ Preprocessing ready")

# DATASET

print("\n[2] Loading ImageNet validation dataset...")

val_dataset = datasets.ImageFolder(IMAGENET_PATH, transform=preprocess)

# Both subsets are index prefixes of the same dataset, so the calibration
# images are also the first 1000 evaluation images.
# NOTE(review): ImageFolder orders samples by class folder, so a prefix
# presumably covers only a few classes — confirm this is acceptable for
# calibration and for the reported accuracy.
calibration_indices = range(1000)
inference_indices = range(5000)
calibration_dataset = Subset(val_dataset, calibration_indices)
inference_dataset = Subset(val_dataset, inference_indices)

print("✓ Dataset loaded")
n_calibration = len(calibration_dataset)
n_inference = len(inference_dataset)
print("  Calibration images:", n_calibration)
print("  Inference images:", n_inference)

# STATIC QUANTIZATION

print("\n[3] Running static INT8 quantization...")

# One-shot reader over the calibration subset (consumed by quantize_static).
calibration_reader = InceptionCalibrationDataReader(calibration_dataset)

# `quant_format` is deliberately left at the library default: it expects a
# QuantFormat member, and passing a QuantType value there (as this call used
# to) only selects a format by accident of the enum comparison.
# Signed 8-bit weights and activations, one scale per tensor (per_channel off).
quantize_static(
    model_input=FP32_MODEL_PATH,
    model_output=INT8_MODEL_PATH,
    calibration_data_reader=calibration_reader,
    per_channel=False,
    weight_type=QuantType.QInt8,
    activation_type=QuantType.QInt8,
)

print("✓ INT8 model saved to:", INT8_MODEL_PATH)


# INFERENCE FUNCTION (WITH PROGRESS BAR)

def run_inference(model_path, loader, tag):
    """Evaluate an ONNX model on labelled batches and report accuracy and time.

    Args:
        model_path: Path of the ONNX model passed to ``ort.InferenceSession``.
        loader: Iterable of ``(images, labels)`` batches; both items must
            expose ``.numpy()``. The first model output is treated as
            per-class scores of shape ``(batch, classes)``.
        tag: Label used for the progress bar and the summary line.

    Returns:
        Wall-clock seconds spent in the evaluation loop. This includes the
        time the loader needs to produce batches, not only model execution.
    """
    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    session = ort.InferenceSession(model_path, providers=providers)
    input_name = session.get_inputs()[0].name

    print(f"  Active provider: {session.get_providers()[0]}")

    correct1 = correct5 = total = 0
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    for images, labels in tqdm(loader, desc=tag):
        targets = labels.numpy()
        logits = session.run(None, {input_name: images.numpy()})[0]

        pred1 = np.argmax(logits, axis=1)
        # Indices of the five highest scores per row (order is irrelevant).
        pred5 = np.argsort(logits, axis=1)[:, -5:]

        correct1 += int((pred1 == targets).sum())
        # Row-wise membership test without a per-sample Python loop.
        correct5 += int((pred5 == targets[:, None]).any(axis=1).sum())
        total += targets.shape[0]

    elapsed = time.perf_counter() - start

    if total == 0:
        # An empty loader would otherwise divide by zero in the summary.
        print(f"\n{tag} | no samples evaluated | Time: {elapsed:.2f}s")
        return elapsed

    print(
        f"\n{tag} | "
        f"Top-1: {100 * correct1 / total:.2f}% | "
        f"Top-5: {100 * correct5 / total:.2f}% | "
        f"Time: {elapsed:.2f}s"
    )

    return elapsed


print("\n[4] Running evaluation...")

# The same loader (fixed order, no shuffling) is reused for both models so
# they see identical batches.
loader_options = dict(batch_size=16, shuffle=False, num_workers=2)
inference_loader = DataLoader(inference_dataset, **loader_options)

print("\nFP32 inference...")
fp32_time = run_inference(FP32_MODEL_PATH, inference_loader, "FP32")

print("\nINT8 inference...")
int8_time = run_inference(INT8_MODEL_PATH, inference_loader, "INT8")

# Ratio above 1 means the INT8 run finished faster than the FP32 run.
speed_up = fp32_time / int8_time
banner = "="*50
print("\n" + banner)
print(f"Speed-up: {speed_up:.2f}x")
print(banner)

Original FP32 model exists: True
ImageNet exists: True

Copying FP32 model to writable directory...
✓ Model copied to: /kaggle/working/inception_v3_fp32.onnx

[1] Preparing preprocessing...
✓ Preprocessing ready

[2] Loading ImageNet validation dataset...
✓ Dataset loaded
  Calibration images: 1000
  Inference images: 5000

[3] Running static INT8 quantization...




✓ INT8 model saved to: /kaggle/working/inception_v3_int8.onnx

[4] Running evaluation...

FP32 inference...
  Active provider: CPUExecutionProvider


FP32: 100%|██████████| 313/313 [05:10<00:00,  1.01it/s]



FP32 | Top-1: 82.16% | Top-5: 96.08% | Time: 310.99s

INT8 inference...
  Active provider: CPUExecutionProvider


INT8: 100%|██████████| 313/313 [05:19<00:00,  1.02s/it]


INT8 | Top-1: 81.14% | Top-5: 95.56% | Time: 319.41s

Speed-up: 0.97x





In [12]:
import onnxruntime as ort
from onnxruntime.quantization import quantize_static, QuantType, CalibrationDataReader
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Subset
import numpy as np
import time
import os
import shutil
from tqdm import tqdm

# -----------------------------------------------------
# Kaggle locations used by this cell
# -----------------------------------------------------
# Root folder handed to ImageFolder further down.
IMAGENET_PATH = "/kaggle/input/imagenet1kvalid"

# FP32 model on the input mount; it is copied before being quantized.
ORIGINAL_FP32_PATH = "/kaggle/input/inception-v3/onnx/default/1/inception_v3.onnx"

# Working-directory copy of the FP32 model and the quantized output file.
FP32_MODEL_PATH = "/kaggle/working/inception_v3_fp32.onnx"
INT8_ASYM_MODEL_PATH = "/kaggle/working/inception_v3_int8_asymmetric.onnx"

# Informational only: a missing source still surfaces as an error from the
# copy below.
model_found = os.path.isfile(ORIGINAL_FP32_PATH)
imagenet_found = os.path.isdir(IMAGENET_PATH)
print("Original FP32 model exists:", model_found)
print("ImageNet exists:", imagenet_found)

print("\nCopying FP32 model to writable directory...")
shutil.copy(src=ORIGINAL_FP32_PATH, dst=FP32_MODEL_PATH)
print("✓ Model copied to:", FP32_MODEL_PATH)

# =====================================================
# CALIBRATION DATA READER
# =====================================================
class InceptionCalibrationDataReader(CalibrationDataReader):
    """Feed calibration batches from a dataset to ``quantize_static``.

    Batches are drawn in order (no shuffling) from a ``DataLoader`` built over
    ``calibration_dataset``; labels are discarded.

    Args:
        calibration_dataset: Dataset yielding ``(image, label)`` pairs whose
            images collate into a tensor exposing ``.numpy()``.
        batch_size: Number of images per calibration batch.
        input_name: Name of the model input the images are bound to. Defaults
            to ``"input"``; pass the real name when the model uses another one.
    """

    def __init__(self, calibration_dataset, batch_size=16, input_name="input"):
        self.loader = DataLoader(
            calibration_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2
        )
        # A single pass: once this iterator is exhausted get_next keeps
        # returning None.
        self.iterator = iter(self.loader)
        self.input_name = input_name

    def get_next(self):
        """Return the next ``{input_name: ndarray}`` feed, or ``None`` when done."""
        batch = next(self.iterator, None)
        if batch is None:
            return None
        images, _ = batch
        return {self.input_name: images.numpy()}

# =====================================================
# PREPROCESSING
# =====================================================
print("\n[1] Preparing preprocessing...")

# Per-channel statistics applied after conversion to a tensor.
# NOTE(review): these look like the usual ImageNet mean/std — confirm they
# match the preprocessing the ONNX model was exported with.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Resize, then take a 299x299 centre crop before tensor conversion and
# normalisation.
preprocess_steps = [
    transforms.Resize(342),
    transforms.CenterCrop(299),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
preprocess = transforms.Compose(preprocess_steps)

print("✓ Preprocessing ready")

# =====================================================
# DATASET
# =====================================================
print("\n[2] Loading ImageNet validation dataset...")

val_dataset = datasets.ImageFolder(IMAGENET_PATH, transform=preprocess)

# Both subsets are index prefixes of the same dataset, so the calibration
# images are also the first 1000 evaluation images.
# NOTE(review): ImageFolder orders samples by class folder, so a prefix
# presumably covers only a few classes — confirm this is acceptable for
# calibration and for the reported accuracy.
calibration_indices = range(1000)
inference_indices = range(5000)
calibration_dataset = Subset(val_dataset, calibration_indices)
inference_dataset = Subset(val_dataset, inference_indices)

print("✓ Dataset loaded")
n_calibration = len(calibration_dataset)
n_inference = len(inference_dataset)
print("  Calibration images:", n_calibration)
print("  Inference images:", n_inference)

# =====================================================
# ASYMMETRIC STATIC QUANTIZATION
# =====================================================
print("\n[3] Running STATIC ASYMMETRIC INT8 quantization...")

# One-shot reader over the calibration subset (consumed by quantize_static).
calibration_reader = InceptionCalibrationDataReader(calibration_dataset)

# Signed 8-bit weights, unsigned 8-bit activations, one scale per tensor.
# NOTE(review): the "asymmetric" label follows the author's wording; whether
# zero-points are actually symmetric is presumably controlled by the
# quantizer's extra options rather than by signedness alone — confirm.
quantization_options = dict(
    weight_type=QuantType.QInt8,
    activation_type=QuantType.QUInt8,
    per_channel=False,
)
quantize_static(
    model_input=FP32_MODEL_PATH,
    model_output=INT8_ASYM_MODEL_PATH,
    calibration_data_reader=calibration_reader,
    **quantization_options,
)

print("✓ Asymmetric INT8 model saved to:", INT8_ASYM_MODEL_PATH)

# =====================================================
# INFERENCE FUNCTION
# =====================================================
def run_inference(model_path, loader, tag):
    """Evaluate an ONNX model on labelled batches and report accuracy and time.

    Args:
        model_path: Path of the ONNX model passed to ``ort.InferenceSession``.
        loader: Iterable of ``(images, labels)`` batches; both items must
            expose ``.numpy()``. The first model output is treated as
            per-class scores of shape ``(batch, classes)``.
        tag: Label used for the progress bar and the summary line.

    Returns:
        Wall-clock seconds spent in the evaluation loop. This includes the
        time the loader needs to produce batches, not only model execution.
    """
    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    session = ort.InferenceSession(model_path, providers=providers)

    input_name = session.get_inputs()[0].name
    print(f"  Active provider: {session.get_providers()[0]}")

    correct1 = correct5 = total = 0
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    for images, labels in tqdm(loader, desc=tag):
        targets = labels.numpy()
        logits = session.run(None, {input_name: images.numpy()})[0]

        pred1 = np.argmax(logits, axis=1)
        # Indices of the five highest scores per row (order is irrelevant).
        pred5 = np.argsort(logits, axis=1)[:, -5:]

        correct1 += int((pred1 == targets).sum())
        # Row-wise membership test without a per-sample Python loop.
        correct5 += int((pred5 == targets[:, None]).any(axis=1).sum())
        total += targets.shape[0]

    elapsed = time.perf_counter() - start

    if total == 0:
        # An empty loader would otherwise divide by zero in the summary.
        print(f"\n{tag} | no samples evaluated | Time: {elapsed:.2f}s")
        return elapsed

    print(
        f"\n{tag} | "
        f"Top-1: {100 * correct1 / total:.2f}% | "
        f"Top-5: {100 * correct5 / total:.2f}% | "
        f"Time: {elapsed:.2f}s"
    )

    return elapsed

# =====================================================
# EVALUATION
# =====================================================
print("\n[4] Running evaluation...")

# The same loader (fixed order, no shuffling) is reused for both models so
# they see identical batches.
loader_options = dict(batch_size=16, shuffle=False, num_workers=2)
inference_loader = DataLoader(inference_dataset, **loader_options)

print("\nFP32 inference...")
fp32_time = run_inference(FP32_MODEL_PATH, inference_loader, "FP32")

print("\nINT8 ASYMMETRIC inference...")
int8_time = run_inference(INT8_ASYM_MODEL_PATH, inference_loader, "INT8-ASYM")

# Ratio above 1 means the INT8 run finished faster than the FP32 run.
speed_up = fp32_time / int8_time
banner = "=" * 50
print("\n" + banner)
print(f"Speed-up: {speed_up:.2f}x")
print(banner)


Original FP32 model exists: True
ImageNet exists: True

Copying FP32 model to writable directory...
✓ Model copied to: /kaggle/working/inception_v3_fp32.onnx

[1] Preparing preprocessing...
✓ Preprocessing ready

[2] Loading ImageNet validation dataset...
✓ Dataset loaded
  Calibration images: 1000
  Inference images: 5000

[3] Running STATIC ASYMMETRIC INT8 quantization...




✓ Asymmetric INT8 model saved to: /kaggle/working/inception_v3_int8_asymmetric.onnx

[4] Running evaluation...

FP32 inference...
  Active provider: CPUExecutionProvider


FP32: 100%|██████████| 313/313 [05:18<00:00,  1.02s/it]



FP32 | Top-1: 82.16% | Top-5: 96.08% | Time: 318.36s

INT8 ASYMMETRIC inference...
  Active provider: CPUExecutionProvider


INT8-ASYM: 100%|██████████| 313/313 [04:36<00:00,  1.13it/s]


INT8-ASYM | Top-1: 81.30% | Top-5: 95.64% | Time: 276.19s

Speed-up: 1.15x



