# Day 31: INT4 Quantization - Part 2

In this notebook, we'll explore INT4 quantization for large language models. INT4 quantization reduces precision even further than INT8, offering greater memory savings but with potential quality trade-offs.

## Overview

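1. Setup and dependencies
2. Helper functions
3. Loading a pre-trained model and measuring its size and memory usage
4. INT4 quantization with bitsandbytes
5. Loading an INT8 model for comparison
6. Reloading the INT4 model
7. Reloading the FP16 model
8. Comparing inference latency
9. Comparing text generation quality
10. Comparing results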

## 1. Setup and Dependencies

In [None]:
!pip install -q torch transformers datasets evaluate accelerate bitsandbytes psutil

In [None]:
import os
import time
import torch
import numpy as np
import psutil
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Select the GPU when PyTorch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Report the installed PyTorch version alongside the results
print(f"PyTorch version: {torch.__version__}")

## 2. Helper Functions

Let's define some helper functions to measure memory usage and model size.

In [None]:
# Function to measure memory usage
def get_memory_usage():
    """Return the memory currently in use, in MB.

    When CUDA is available, the result is the memory occupied by tensors
    summed over all visible GPUs. The host process's resident set size (RSS)
    does not account for memory allocated on the GPU, so it cannot show how
    much memory a GPU-resident model takes. Without CUDA, the function falls
    back to the RSS of the current process.

    Returns:
        float: Memory in use, in MB (1 MB = 1024 * 1024 bytes).
    """
    bytes_per_mb = 1024 * 1024

    if torch.cuda.is_available():
        # Sum over every device: the caller may have spread a model across GPUs
        allocated_bytes = sum(
            torch.cuda.memory_allocated(index)
            for index in range(torch.cuda.device_count())
        )
        return allocated_bytes / bytes_per_mb

    # CPU-only fallback: resident set size of this Python process
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / bytes_per_mb

# Function to count parameters
def count_parameters(model):
    """Count the logical number of parameters in a model.

    A parameter that carries a ``quant_state`` with a ``shape`` is counted by
    that recorded shape instead of by ``numel()``. This is intended for
    bitsandbytes 4-bit weights, which pack two 4-bit values into each stored
    byte, so ``numel()`` on the packed tensor reports only about half of the
    real weight count.
    NOTE(review): relies on ``quant_state.shape`` holding the pre-quantization
    shape -- confirm against the installed bitsandbytes version.

    Args:
        model: Object whose ``parameters()`` yields tensors exposing ``numel()``.

    Returns:
        int: Total number of parameters (0 for a model without parameters).
    """
    import math  # local import: only this function needs it

    total = 0
    for param in model.parameters():
        # Parameters without a usable quant_state fall back to the stored size
        quant_state = getattr(param, "quant_state", None)
        original_shape = getattr(quant_state, "shape", None)
        if original_shape is not None:
            total += math.prod(original_shape)
        else:
            total += param.numel()
    return total

# Function to calculate model size in MB
def calculate_model_size(model, bits_per_param):
    """Estimate a model's size in MB from its parameter count.

    The estimate assumes every parameter is stored with exactly
    ``bits_per_param`` bits; it does not inspect the actual tensor storage.

    Args:
        model: Model passed on to ``count_parameters``.
        bits_per_param: Number of bits assumed per parameter (e.g. 16, 8, 4).

    Returns:
        float: Estimated size in MB (1 MB = 1024 * 1024 bytes).
    """
    bytes_per_mb = 1024 * 1024
    total_bits = count_parameters(model) * bits_per_param
    # 8 bits per byte, then bytes to MB
    return total_bits / 8 / bytes_per_mb

# Function to measure inference time
def measure_inference_time(model, tokenizer, prompt, num_runs=5):
    """Measure the average wall-clock time of one ``generate`` call.

    One untimed warm-up generation is run first, then ``num_runs`` timed
    generations are averaged.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, max_length=...)``.
        tokenizer: Callable that turns ``prompt`` into model inputs supporting
            ``.to(device)``.
        prompt: Text to generate from.
        num_runs: Number of timed generations to average over; must be >= 1.

    Returns:
        float: Mean seconds per generation over ``num_runs`` runs.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    def _wait_for_gpu():
        # CUDA work is queued asynchronously; wait for it to finish so the
        # clock covers completed work only.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    # Tokenize once and place the inputs on the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        # Warm-up run, excluded from the timing
        model.generate(**inputs, max_length=50)
        _wait_for_gpu()

        # perf_counter is monotonic and higher resolution than time.time
        start_time = time.perf_counter()
        for _ in range(num_runs):
            model.generate(**inputs, max_length=50)
        _wait_for_gpu()
        elapsed = time.perf_counter() - start_time

    return elapsed / num_runs

## 3. Loading a Pre-trained Model

We'll use a small language model for demonstration purposes.

In [None]:
# Checkpoint used throughout the notebook; a small model keeps the demo quick
model_name = "facebook/opt-350m"

# Fetch the matching tokenizer and reuse the end-of-sequence token for padding
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Take a baseline reading before any model weights are loaded
initial_memory = get_memory_usage()
print(f"Initial memory usage: {initial_memory:.2f} MB")

# Half-precision (FP16) reference model that the quantized variants are compared against
print("Loading model in FP16...")
fp16_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
fp16_model = fp16_model.to(device)

# Parameter count, estimated size at 16 bits per parameter, and memory growth since the baseline
fp16_params = count_parameters(fp16_model)
fp16_size = calculate_model_size(fp16_model, bits_per_param=16)
fp16_memory = get_memory_usage() - initial_memory

print(f"FP16 model parameters: {fp16_params:,}")
print(f"FP16 model size: {fp16_size:.2f} MB")
print(f"FP16 model memory usage: {fp16_memory:.2f} MB")

## 4. INT4 Quantization with bitsandbytes

We'll use the bitsandbytes library through Hugging Face's integration to load the model in 4-bit precision.

In [None]:
# Drop the FP16 model and reclaim its memory before the INT4 model is loaded
del fp16_model
gc.collect()
if torch.cuda.is_available():
    # Release unused cached GPU memory held by PyTorch's allocator
    torch.cuda.empty_cache()

# New baseline for the next memory measurement
initial_memory = get_memory_usage()
print(f"Memory after cleanup: {initial_memory:.2f} MB")

In [None]:
# 4-bit quantization settings for bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Store weights in 4-bit precision
    bnb_4bit_quant_type="nf4",             # NF4 (4-bit NormalFloat) data type
    bnb_4bit_use_double_quant=True,        # Enable double quantization
    bnb_4bit_compute_dtype=torch.float16,  # Run computations in FP16
)

# Load the checkpoint with the 4-bit configuration; device placement is automatic
print("Loading model with INT4 quantization...")
int4_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Parameter count, estimated size at 4 bits per parameter, and memory growth since the baseline
int4_params = count_parameters(int4_model)
int4_size = calculate_model_size(int4_model, bits_per_param=4)
int4_memory = get_memory_usage() - initial_memory

print(f"INT4 model parameters: {int4_params:,}")
print(f"INT4 model size: {int4_size:.2f} MB")
print(f"INT4 model memory usage: {int4_memory:.2f} MB")

## 5. Loading INT8 Model for Comparison

Let's also load an INT8 model to compare with our INT4 model.

In [None]:
# Drop the INT4 model and reclaim its memory before the INT8 model is loaded
del int4_model
gc.collect()
if torch.cuda.is_available():
    # Release unused cached GPU memory held by PyTorch's allocator
    torch.cuda.empty_cache()

# New baseline for the next memory measurement
initial_memory = get_memory_usage()
print(f"Memory after cleanup: {initial_memory:.2f} MB")

In [None]:
# 8-bit quantization settings for bitsandbytes
bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

# Load the checkpoint with the 8-bit configuration; device placement is automatic
print("Loading model with INT8 quantization...")
int8_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config_8bit,
)

# Parameter count, estimated size at 8 bits per parameter, and memory growth since the baseline
int8_params = count_parameters(int8_model)
int8_size = calculate_model_size(int8_model, bits_per_param=8)
int8_memory = get_memory_usage() - initial_memory

print(f"INT8 model parameters: {int8_params:,}")
print(f"INT8 model size: {int8_size:.2f} MB")
print(f"INT8 model memory usage: {int8_memory:.2f} MB")

## 6. Reload INT4 Model for Comparison

In [None]:
# Drop the INT8 model and reclaim its memory before the INT4 model is loaded again
del int8_model
gc.collect()
if torch.cuda.is_available():
    # Release unused cached GPU memory held by PyTorch's allocator
    torch.cuda.empty_cache()

# New baseline reading after the cleanup
initial_memory = get_memory_usage()
print(f"Memory after cleanup: {initial_memory:.2f} MB")

# Load the 4-bit model again with the configuration defined earlier
# NOTE(review): the next cell deletes `int4_model` again before anything uses it,
# so this reload looks redundant -- confirm before removing.
print("Reloading model with INT4 quantization...")
int4_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

## 7. Reload FP16 Model for Comparison

In [None]:
# Drop the INT4 model and reclaim its memory before the FP16 model is loaded again
del int4_model
gc.collect()
if torch.cuda.is_available():
    # Release unused cached GPU memory held by PyTorch's allocator
    torch.cuda.empty_cache()

# New baseline reading after the cleanup
initial_memory = get_memory_usage()
print(f"Memory after cleanup: {initial_memory:.2f} MB")

# Bring the FP16 reference model back for the latency comparison
print("Reloading model in FP16...")
fp16_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
fp16_model = fp16_model.to(device)

## 8. Comparing Inference Latency

Let's measure and compare the inference latency of the different precision models.

In [None]:
# Prompt shared by every latency and generation run below
prompt = "Artificial intelligence will transform the future by"

# Average generation latency of the FP16 reference model
fp16_time = measure_inference_time(fp16_model, tokenizer, prompt)
print(f"FP16 average inference time: {fp16_time:.4f} seconds")

# Free the FP16 model before the next model is loaded
del fp16_model
gc.collect()
if torch.cuda.is_available():
    # Release unused cached GPU memory held by PyTorch's allocator
    torch.cuda.empty_cache()

In [None]:
# Bring the 8-bit model back for timing
int8_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config_8bit,
)

# Average generation latency of the INT8 model
int8_time = measure_inference_time(int8_model, tokenizer, prompt)
print(f"INT8 average inference time: {int8_time:.4f} seconds")

# Free the INT8 model before the next model is loaded
del int8_model
gc.collect()
if torch.cuda.is_available():
    # Release unused cached GPU memory held by PyTorch's allocator
    torch.cuda.empty_cache()

In [None]:
# Bring the 4-bit model back for timing; it stays loaded for the generation cell below
int4_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Average generation latency of the INT4 model
int4_time = measure_inference_time(int4_model, tokenizer, prompt)
print(f"INT4 average inference time: {int4_time:.4f} seconds")

## 9. Comparing Text Generation Quality

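Let's look at the quality of text generated by the INT4 model. Only the INT4 model is still loaded at this point; to compare across precisions, run the same generation with the FP16 and INT8 models as well.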

In [None]:
# Tokenize the shared prompt and place it on the INT4 model's device
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(int4_model.device)

# Sample a single continuation; gradients are not needed for inference
with torch.no_grad():
    outputs = int4_model.generate(
        **inputs,
        max_length=100,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

# Convert the first (and only) returned sequence back to text, dropping special tokens
int4_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text with INT4 model:")
print(int4_text)

## 10. Comparing Results

Let's compile and visualize our results.

In [None]:
import pandas as pd


def _format_ratio(baseline, value):
    """Format ``baseline / value`` like "2.00x", or "n/a" when it is undefined.

    The memory figures are differences between two readings, so they can be
    zero or negative; dividing by them would raise or print a meaningless
    ratio.
    """
    if baseline <= 0 or value <= 0:
        return "n/a"
    return f"{baseline / value:.2f}x"


# Compile results: raw metrics per precision plus ratios relative to FP16
results = {
    "Precision": ["FP16", "INT8", "INT4"],
    "Model Size (MB)": [fp16_size, int8_size, int4_size],
    "Memory Usage (MB)": [fp16_memory, int8_memory, int4_memory],
    "Inference Time (s)": [fp16_time, int8_time, int4_time],
    "Size Reduction": [
        "1.0x",
        _format_ratio(fp16_size, int8_size),
        _format_ratio(fp16_size, int4_size),
    ],
    "Memory Reduction": [
        "1.0x",
        _format_ratio(fp16_memory, int8_memory),
        _format_ratio(fp16_memory, int4_memory),
    ],
    "Speed Improvement": [
        "1.0x",
        _format_ratio(fp16_time, int8_time),
        _format_ratio(fp16_time, int4_time),
    ],
}

# Display results as a table (the last expression of the cell is rendered by the notebook)
results_df = pd.DataFrame(results)
results_df

In [None]:
# Draw one bar chart per metric, side by side in a single row
import matplotlib.pyplot as plt

bar_colors = ["blue", "green", "red"]

# (key in `results`, chart title, y-axis label) for each panel, left to right
panels = [
    ("Model Size (MB)", "Model Size Comparison", "Size (MB)"),
    ("Memory Usage (MB)", "Memory Usage Comparison", "Memory (MB)"),
    ("Inference Time (s)", "Inference Time Comparison", "Time (seconds)"),
]

plt.figure(figsize=(15, 5))

for position, (metric, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.bar(results["Precision"], results[metric], color=bar_colors)
    plt.title(title)
    plt.ylabel(y_label)
    plt.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

## Conclusion

In this notebook, we've explored INT4 quantization using the bitsandbytes library and compared it with FP16 and INT8 models. We've seen that:

1. INT4 quantization provides significant memory savings compared to FP16 and INT8 models
2. The inference speed improvements can vary depending on hardware support for INT4 operations
3. The quality of text generation may be affected, but often remains acceptable for many use cases

Key takeaways:
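- INT4 quantization can reduce the size of the quantized weights by approximately 4x compared to FP16; the overall reduction is smaller when some layers (for example, embeddings) are kept in higher precision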
- The memory usage reduction can enable running larger models on limited hardware
- The trade-off between quality and efficiency needs to be evaluated for each specific use case

In the next part, we'll explore more advanced quantization techniques like AWQ and GPTQ.