In [0]:
!pip install transformers torch torchao accelerate bitsandbytes
# We could use uv in other enviroments as well

In [0]:
# Restart the Python process through the Databricks `dbutils` utility so that
# the cells below import the packages installed by the previous cell.
dbutils.library.restartPython()

## Quantization example

In [0]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import time

### ============================================
### 1. CARGAR MODELO NORMAL (FP32/FP16)
### ============================================

In [0]:
# Load GPT-2 twice through Hugging Face Transformers: once as a plain FP32
# model and once quantized to INT8 via the bitsandbytes integration.

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Baseline: weights loaded as FP32.
model_fp32 = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2",
    torch_dtype=torch.float32
)

# INT8 variant, quantized at load time by bitsandbytes.
# The quantization settings are passed as an explicit BitsAndBytesConfig:
# the bare `load_in_8bit=True` keyword of `from_pretrained` is deprecated in
# transformers in favour of `quantization_config`.
model_int8 = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)

# Size measurement helper
def get_model_size_mb(model):
    """Return the size of ``model``'s parameters and buffers in MiB.

    For every tensor yielded by ``model.parameters()`` and ``model.buffers()``
    the element count is multiplied by the per-element byte width; the grand
    total is then divided by 1024 * 1024.
    """
    def tensor_bytes(tensors):
        # Bytes occupied by the raw data of each tensor in the iterable.
        return sum(t.nelement() * t.element_size() for t in tensors)

    footprint = tensor_bytes(model.parameters()) + tensor_bytes(model.buffers())
    return footprint / (1024 * 1024)

# Footprint of each GPT-2 variant (FP32 first, then INT8), in MB.
size_fp32, size_int8 = (
    get_model_size_mb(variant) for variant in (model_fp32, model_int8)
)

# One print call, one line per argument: header, both sizes, and the
# percentage by which the INT8 footprint is smaller than the FP32 one.
print(
    f"\n=== COMPARACIÓN GPT-2 ===",
    f"FP32: {size_fp32:.2f} MB",
    f"INT8: {size_int8:.2f} MB",
    f"Reducción: {(1 - size_int8/size_fp32)*100:.1f}%",
    sep="\n",
)

In [0]:
from transformers import AutoModel
import torch
import copy

# Load the FP32 baseline and move it to the CPU explicitly.
model_fp32 = AutoModel.from_pretrained(
    "distilbert-base-uncased",
    torch_dtype=torch.float32
).cpu()

print(f"Modelo cargado: DistilBERT")

# Quantize a private copy (in eval mode) so `model_fp32` stays untouched for
# the comparisons that follow. `quantize_dynamic` deep-copies its input unless
# `inplace=True`; because the copy is made explicitly here, quantizing it in
# place avoids holding a second, redundant copy of the whole model.
# Only `torch.nn.Linear` modules are converted, using qint8 weights.
model_int8 = torch.quantization.quantize_dynamic(
    copy.deepcopy(model_fp32).eval(),
    {torch.nn.Linear},
    dtype=torch.qint8,
    inplace=True
)

# Measure
def get_size(model):
    """Return the serialized size of ``model``'s ``state_dict`` in MiB.

    Summing ``model.parameters()`` and ``model.buffers()`` is not enough for a
    dynamically-quantized model: its quantized ``Linear`` layers keep their
    int8 weights in packed-parameter objects that are neither parameters nor
    buffers, so those weights would be left out and the size reduction
    overstated. The ``state_dict`` does contain them, so the model is
    serialized into an in-memory buffer and the byte count of that buffer is
    reported. The same measurement is valid for the unquantized model, which
    keeps the FP32 / INT8 comparison like-for-like.

    Args:
        model: a ``torch.nn.Module``.

    Returns:
        Size in MiB (bytes / 1024**2) of ``torch.save(model.state_dict())``.
    """
    import io  # local import: keeps this helper self-contained within the cell

    serialized = io.BytesIO()
    torch.save(model.state_dict(), serialized)
    # getbuffer() exposes the written bytes without copying them.
    return serialized.getbuffer().nbytes / (1024 * 1024)

# Footprint of the FP32 model and of its quantized copy, in MB.
size_fp32, size_int8 = (get_size(variant) for variant in (model_fp32, model_int8))

# One print call, one line per argument: header, both sizes, and the
# percentage by which the INT8 footprint is smaller than the FP32 one.
print(
    f"\n=== COMPARACIÓN ===",
    f"FP32: {size_fp32:.2f} MB",
    f"INT8: {size_int8:.2f} MB",
    f"Reducción: {(1 - size_int8/size_fp32)*100:.1f}%",
    sep="\n",
)

### Comparemos ahora el trade-off entre velocidad de inferencia y precisión del modelo

In [0]:
import time
import torch

# Synthetic benchmark input: random token ids, batch=1, seq_len=128.
# 30522 is the exclusive upper bound of the ids — presumably the DistilBERT
# vocabulary size; confirm against the tokenizer.
inputs = torch.randint(0, 30522, (1, 128))


def _mean_latency(model, model_inputs, n_runs=100, n_warmup=5):
    """Return the mean wall-clock seconds of one forward pass of ``model``.

    Args:
        model: module called as ``model(model_inputs)``; switched to eval mode.
        model_inputs: input passed unchanged to every call.
        n_runs: number of timed forward passes that are averaged.
        n_warmup: untimed passes run first so one-off start-up costs of the
            first calls do not skew the average.

    Returns:
        Elapsed seconds divided by ``n_runs``.
    """
    model.eval()
    with torch.no_grad():
        for _ in range(n_warmup):
            model(model_inputs)
        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring short durations; time.time() can jump with clock updates.
        start = time.perf_counter()
        for _ in range(n_runs):
            model(model_inputs)
        return (time.perf_counter() - start) / n_runs


# Same procedure for both models so the timings are comparable.
time_fp32 = _mean_latency(model_fp32, inputs)
time_int8 = _mean_latency(model_int8, inputs)

print(f"\n=== VELOCIDAD DE INFERENCIA ===")
print(f"FP32: {time_fp32*1000:.2f} ms")
print(f"INT8: {time_int8*1000:.2f} ms")
print(f"Speedup: {time_fp32/time_int8:.2f}x")

In [0]:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Sample sentence, tokenized into PyTorch tensors.
text = "The quick brown fox jumps over the lazy dog"
inputs = tokenizer(text, return_tensors="pt")

# Run both models on the same tokens with gradient tracking disabled and keep
# the final hidden states.
with torch.no_grad():
    out_fp32 = model_fp32(**inputs).last_hidden_state
    out_int8 = model_int8(**inputs).last_hidden_state

# Element-wise absolute deviation between the two outputs.
diff = torch.abs(out_fp32 - out_int8)

# Report mean and max deviation, plus the mean deviation relative to the mean
# magnitude of the FP32 output (as a percentage).
print(
    f"\n=== PÉRDIDA DE PRECISIÓN ===",
    f"Diferencia promedio: {diff.mean().item():.6f}",
    f"Diferencia máxima:   {diff.max().item():.6f}",
    f"Diferencia relativa: {(diff.mean() / out_fp32.abs().mean()).item()*100:.2f}%",
    sep="\n",
)