In [None]:
import time
import torch

class TinyClassifier(torch.nn.Module):
    """Small feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        dim: Number of input features consumed by the first linear layer.
        hidden: Width of the hidden layer.
        out: Number of output units produced by the last linear layer.
    """

    def __init__(self, dim=128, hidden=64, out=2):
        super().__init__()
        # Assemble the stack in forward order, then wrap it in a Sequential
        # stored under ``self.net``.
        stack = [
            torch.nn.Linear(dim, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, out),
        ]
        self.net = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        result = self.net(x)
        return result

def benchmark(iters=100, warmup=10):
    """Time single-sample inference of a dynamically quantized TinyClassifier.

    Builds a ``TinyClassifier(dim=128)`` with a fixed seed, applies dynamic
    int8 quantization to its ``torch.nn.Linear`` layers, runs ``warmup``
    untimed forward passes, then times ``iters`` forward passes on one random
    ``(1, 128)`` input and prints the average latency.

    Args:
        iters: Number of timed forward passes. Must be at least 1.
        warmup: Number of untimed forward passes run before timing starts.

    Returns:
        Average time per forward pass in milliseconds.

    Raises:
        ValueError: If ``iters`` is smaller than 1.
    """
    if iters < 1:
        # The average divides by ``iters``; reject values that would divide by zero.
        raise ValueError("iters must be >= 1")

    torch.manual_seed(0)
    model = TinyClassifier(dim=128)
    model.eval()

    qmodel = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

    inputs = torch.randn(1, 128)

    # Inference only: keep autograd bookkeeping out of the measurement.
    with torch.no_grad():
        for _ in range(warmup):
            qmodel(inputs)

        # perf_counter is monotonic and high-resolution; time.time() is the
        # wall clock and can be too coarse for sub-millisecond work.
        t0 = time.perf_counter()
        for _ in range(iters):
            qmodel(inputs)
        t1 = time.perf_counter()

    avg_ms = (t1 - t0) / iters * 1000
    print(f"Avg inference time (Quantized tiny model) = {avg_ms:.2f}ms")
    return avg_ms

# Entry point: run the benchmark only when this code executes as the main
# module (e.g. run directly), not when it is imported from elsewhere.
if __name__ == "__main__" :
  benchmark()

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import psutil

import io  # used only below to measure serialized model size in memory

# Load pretrained DistilGPT-2 (a small generative model)
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()


def _serialized_size_mb(module):
    """Return the size, in MB (1e6 bytes), of ``module.state_dict()`` as written by ``torch.save``."""
    buffer = io.BytesIO()
    torch.save(module.state_dict(), buffer)
    return buffer.getbuffer().nbytes / 1e6


# Process RSS with the FP32 model already loaded
process = psutil.Process()
mem_before = process.memory_info().rss / 1e6

# Quantize the model (inplace defaults to False, so `model` itself is left untouched)
# NOTE(review): transformers' GPT-2 implements its attention/MLP projections with a
# custom Conv1D module rather than nn.Linear, so this filter likely matches only
# `lm_head` — confirm by inspecting `qmodel` before drawing conclusions.
qmodel = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# `model` is still referenced at this point, so this RSS reading covers the FP32
# model AND the quantized one. It cannot show a saving relative to `mem_before`.
mem_after = process.memory_info().rss / 1e6

# Like-for-like footprint comparison: serialized state-dict size of each model.
size_fp32 = _serialized_size_mb(model)
size_int8 = _serialized_size_mb(qmodel)

# Sample input
inputs = tokenizer("The future of AI is", return_tensors="pt")

# Pass pad_token_id explicitly: without it generate() falls back to the EOS token
# and emits a warning on every call, including inside the timed loop.
gen_kwargs = dict(max_length=30, pad_token_id=tokenizer.eos_token_id)

# Warm-up
for _ in range(5):
    _ = qmodel.generate(**inputs, **gen_kwargs)

# Benchmark inference speed (perf_counter is monotonic and high-resolution)
iters = 20
t0 = time.perf_counter()
for _ in range(iters):
    _ = qmodel.generate(**inputs, **gen_kwargs)
t1 = time.perf_counter()

print(f"Avg inference time (quantized): {(t1 - t0)/iters*1000:.2f} ms")
print(f"Memory before quantization: {mem_before:.2f} MB")
print(f"Memory after quantization:  {mem_after:.2f} MB")
print(f"Serialized size (FP32):      {size_fp32:.2f} MB")
print(f"Serialized size (quantized): {size_int8:.2f} MB")
