In [47]:
import torch
import torch.nn as nn
from copy import deepcopy
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [1]:
# Load the model and tokenizer
model_name = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True)

# Prepare input
prompt = "Write a short story about a robot learning to paint:"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
outputs = model.generate(**inputs, max_length=100)
generated_text = tokenizer.decode(outputs[0], cache_position=True)

print(generated_text)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


Write a short story about a robot learning to paint:


Once upon a time in a bustling city filled with the hum of technology, there was a robot named Artie. Artie was no ordinary machine; he was designed with the ability to learn and adapt. His creators had programmed him with a passion for art, but Artie had yet to discover his own style.


One day, while wandering through the city's art district, Artie


## Part-1

In [2]:
from helper import get_model_size, W8A16LinearLayer, replace_linear_with_target_and_quantize

In [10]:
print("Model before:\n\n", model)

Model before:

 Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, o

In [9]:
def get_model_size(model):
    """
    Calculate the size of the model in megabytes.
    Takes into account different parameter dtypes.
    """
    total_bytes = 0
    for param in model.parameters():
        # Get bytes per element based on dtype
        if param.dtype == torch.float32:
            bytes_per_element = 4
        elif param.dtype == torch.float16:
            bytes_per_element = 2
        elif param.dtype == torch.int8:
            bytes_per_element = 1
        else:
            bytes_per_element = param.element_size()  # fallback for other dtypes
            
        # Calculate bytes for this parameter
        param_bytes = param.numel() * bytes_per_element
        total_bytes += param_bytes
        
    # Convert to megabytes
    size_mb = total_bytes / (1024 ** 2)
    return size_mb

def get_detailed_model_size(model):
    """
    Calculate and print detailed size information for each parameter type.
    """
    size_dict = {
        'float32': 0,
        'float16': 0,
        'int8': 0,
        'other': 0
    }
    
    for name, param in model.named_parameters():
        # Calculate size in MB
        if param.dtype == torch.float32:
            size_dict['float32'] += param.numel() * 4 / (1024 * 1024)
        elif param.dtype == torch.float16:
            size_dict['float16'] += param.numel() * 2 / (1024 * 1024)
        elif param.dtype == torch.int8:
            size_dict['int8'] += param.numel() * 1 / (1024 * 1024)
        else:
            size_dict['other'] += param.numel() * param.element_size() / (1024 * 1024)
    
    return size_dict

get_model_size(model), get_detailed_model_size(model)  

(14576.26171875,
 {'float32': 14576.26171875, 'float16': 0, 'int8': 0, 'other': 0})

In [11]:
replace_linear_with_target_and_quantize(model, 
                                        W8A16LinearLayer, ["lm_head"])

In [12]:
print("Model after:\n\n", model)

Model after:

 Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): W8A16LinearLayer()
          (qkv_proj): W8A16LinearLayer()
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): W8A16LinearLayer()
          (down_proj): W8A16LinearLayer()
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=32064, bias=False)
)


In [13]:
get_model_size(model), get_detailed_model_size(model)  

(752.26171875, {'float32': 752.26171875, 'float16': 0, 'int8': 0, 'other': 0})

In [14]:
# Generate text
outputs = model.generate(**inputs, max_length=100)
generated_text = tokenizer.decode(outputs[0], cache_position=True)

print(generated_text)

Write a short story about a robot learning to paint:


Once upon a time, in a bustling city filled with the hum of technology, there was a robot named Artie. Artie was no ordinary machine; he was designed with the latest AI, capable of learning and adapting. His creators had programmed him with a passion for art, but Artie had yet to discover his own style.


One day, while wandering through the city'


In [19]:
for name, child in model.named_children():
    print(name, ' --> ',child)

model  -->  Phi3Model(
  (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
  (embed_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-31): 32 x Phi3DecoderLayer(
      (self_attn): Phi3Attention(
        (o_proj): W8A16LinearLayer()
        (qkv_proj): W8A16LinearLayer()
        (rotary_emb): Phi3RotaryEmbedding()
      )
      (mlp): Phi3MLP(
        (gate_up_proj): W8A16LinearLayer()
        (down_proj): W8A16LinearLayer()
        (activation_fn): SiLU()
      )
      (input_layernorm): Phi3RMSNorm()
      (resid_attn_dropout): Dropout(p=0.0, inplace=False)
      (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      (post_attention_layernorm): Phi3RMSNorm()
    )
  )
  (norm): Phi3RMSNorm()
)
lm_head  -->  Linear(in_features=3072, out_features=32064, bias=False)


In [79]:
model_name = "microsoft/phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [80]:
previous_memory_footprint = model.get_memory_footprint()
print("Footprint of the model in MBs: ", previous_memory_footprint/1e+6)

Footprint of the model in MBs:  15284.318208


In [81]:
print_model_info(model)



Detailed Model Analysis:
--------------------------------------------------
Layer Types:
- Phi3ForCausalLM: 1
- Phi3Model: 1
- Embedding: 1
- Dropout: 65
- ModuleList: 1
- Phi3DecoderLayer: 32
- Phi3Attention: 32
- Linear: 129
- Phi3RotaryEmbedding: 32
- Phi3MLP: 32
- SiLU: 32
- Phi3RMSNorm: 65

Memory Usage by dtype (MB):
- float32: 14576.26 MB
- float16: 0.00 MB
- int8: 0.00 MB
- other: 0.00 MB


In [82]:
get_detailed_model_size(model)

{'float32': 14576.26171875, 'float16': 0.0, 'int8': 0.0, 'other': 0.0}

In [83]:
generator = pipeline(
    "text-generation",
    model=model,
    # torch_dtype="float32",
    tokenizer=tokenizer,
    # trust_remote_code=True
)

# Generate text
prompt = "Write a short story about a robot learning to paint:"
response = generator(
    prompt,
    max_length=100,
    num_return_sequences=1
)

# Print the generated text
print(response[0]['generated_text'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Write a short story about a robot learning to paint:


Once upon a time in a bustling city filled with the hum of technology, there was a robot named Artie. Artie was no ordinary machine; he was designed with the ability to learn and adapt. His creators had programmed him with a passion for art, but Artie had yet to discover his own style.


One day, while wandering through the city's art district, Artie


In [55]:
# print(pipe(prompt, max_new_tokens=20, do_sample=False))

In [58]:
# outputs = model.generate(**inputs, max_length=100)
# generated_text = tokenizer.decode(outputs[0], cache_position=True)

# print(generated_text)

In [84]:
for name, param in model.named_parameters():
    print(f"Parameter {name} has dtype {param.dtype}")

Parameter model.embed_tokens.weight has dtype torch.float32
Parameter model.layers.0.self_attn.o_proj.weight has dtype torch.float32
Parameter model.layers.0.self_attn.qkv_proj.weight has dtype torch.float32
Parameter model.layers.0.mlp.gate_up_proj.weight has dtype torch.float32
Parameter model.layers.0.mlp.down_proj.weight has dtype torch.float32
Parameter model.layers.0.input_layernorm.weight has dtype torch.float32
Parameter model.layers.0.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers.1.self_attn.o_proj.weight has dtype torch.float32
Parameter model.layers.1.self_attn.qkv_proj.weight has dtype torch.float32
Parameter model.layers.1.mlp.gate_up_proj.weight has dtype torch.float32
Parameter model.layers.1.mlp.down_proj.weight has dtype torch.float32
Parameter model.layers.1.input_layernorm.weight has dtype torch.float32
Parameter model.layers.1.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers.2.self_attn.o_proj.weight has

In [85]:
replace_linear_with_target_and_quantize(model, 
                                        W8A16LinearLayer,
                                        []
                                        ) # quanitize full model

In [86]:
# Generate text
prompt = "Write a short story about a robot learning to paint:"
response = generator(
    prompt,
    max_length=100,
    num_return_sequences=1
)

# Print the generated text
print(response[0]['generated_text'])

Write a short story about a robot learning to paint:


Once upon a time, in a bustling city filled with the hum of technology, there was a robot named Artie. Artie was no ordinary robot; he was designed with the latest AI, capable of learning and adapting. His creators had programmed him with a passion for art, but Artie had yet to discover his own style.


One day, while wandering through the city'


In [87]:
for name, param in model.named_parameters():
    print(f"Parameter {name} has dtype {param.dtype}")

Parameter model.embed_tokens.weight has dtype torch.float32
Parameter model.layers.0.input_layernorm.weight has dtype torch.float32
Parameter model.layers.0.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers.1.input_layernorm.weight has dtype torch.float32
Parameter model.layers.1.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers.2.input_layernorm.weight has dtype torch.float32
Parameter model.layers.2.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers.3.input_layernorm.weight has dtype torch.float32
Parameter model.layers.3.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers.4.input_layernorm.weight has dtype torch.float32
Parameter model.layers.4.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers.5.input_layernorm.weight has dtype torch.float32
Parameter model.layers.5.post_attention_layernorm.weight has dtype torch.float32
Parameter model.layers

In [88]:
def check_model_quantization(model):
    """
    Improved function to verify model quantization status
    """
    def count_quantized_layers():
        quantized_count = 0
        total_count = 0
        
        for name, module in model.named_modules():
            if isinstance(module, (torch.nn.Linear, W8A16LinearLayer)):
                total_count += 1
                if isinstance(module, W8A16LinearLayer):
                    quantized_count += 1
                print(f"Layer {name}: Type = {type(module).__name__}")
        
        return quantized_count, total_count
    
    def check_memory_usage():
        param_size = 0
        for param in model.parameters():
            param_size += param.nelement() * param.element_size()
        buffer_size = 0
        for buffer in model.buffers():
            buffer_size += buffer.nelement() * buffer.element_size()
            
        size_all_mb = (param_size + buffer_size) / 1024**2
        return size_all_mb
    
    # Run checks
    quantized_count, total_count = count_quantized_layers()
    model_size = check_memory_usage()
    
    print(f"\nQuantization Summary:")
    print(f"- Quantized layers (W8A16LinearLayer): {quantized_count}")
    print(f"- Total linear layers: {total_count}")
    print(f"- Quantization ratio: {(quantized_count/total_count)*100:.2f}%")
    print(f"- Model size: {model_size:.2f} MB")
    
    # Consider the model quantized if most layers are quantized
    is_quantized = (quantized_count / total_count) > 0.5
    return is_quantized

# Usage
is_quantized = check_model_quantization(model)
print(f"\nModel quantization status: {'Quantized' if is_quantized else 'Not quantized'}")

Layer model.layers.0.self_attn.o_proj: Type = W8A16LinearLayer
Layer model.layers.0.self_attn.qkv_proj: Type = W8A16LinearLayer
Layer model.layers.0.mlp.gate_up_proj: Type = W8A16LinearLayer
Layer model.layers.0.mlp.down_proj: Type = W8A16LinearLayer
Layer model.layers.1.self_attn.o_proj: Type = W8A16LinearLayer
Layer model.layers.1.self_attn.qkv_proj: Type = W8A16LinearLayer
Layer model.layers.1.mlp.gate_up_proj: Type = W8A16LinearLayer
Layer model.layers.1.mlp.down_proj: Type = W8A16LinearLayer
Layer model.layers.2.self_attn.o_proj: Type = W8A16LinearLayer
Layer model.layers.2.self_attn.qkv_proj: Type = W8A16LinearLayer
Layer model.layers.2.mlp.gate_up_proj: Type = W8A16LinearLayer
Layer model.layers.2.mlp.down_proj: Type = W8A16LinearLayer
Layer model.layers.3.self_attn.o_proj: Type = W8A16LinearLayer
Layer model.layers.3.self_attn.qkv_proj: Type = W8A16LinearLayer
Layer model.layers.3.mlp.gate_up_proj: Type = W8A16LinearLayer
Layer model.layers.3.mlp.down_proj: Type = W8A16LinearLa

In [89]:
get_detailed_model_size(model)

{'float32': 380.514892578125, 'float16': 0.0, 'int8': 3549.9375, 'other': 0.0}

In [90]:
def get_detailed_model_size(model):
    """
    Get detailed memory usage by dtype, including buffers
    """
    def dtype_size_bytes(dtype):
        if dtype == torch.int8:
            return 1
        elif dtype in [torch.float16, torch.bfloat16]:
            return 2
        elif dtype == torch.float32:
            return 4
        else:
            return 8  # default for other types

    size_dict = {
        'float32': 0,
        'float16': 0,
        'int8': 0,
        'other': 0
    }
    
    # Check parameters
    for name, param in model.named_parameters():
        dtype = param.dtype
        size_bytes = param.nelement() * dtype_size_bytes(dtype)
        
        if dtype == torch.float32:
            size_dict['float32'] += size_bytes
        elif dtype == torch.float16:
            size_dict['float16'] += size_bytes
        elif dtype == torch.int8:
            size_dict['int8'] += size_bytes
        else:
            size_dict['other'] += size_bytes
            
    # Check buffers (important for quantized models)
    for name, buffer in model.named_buffers():
        dtype = buffer.dtype
        size_bytes = buffer.nelement() * dtype_size_bytes(dtype)
        
        if dtype == torch.float32:
            size_dict['float32'] += size_bytes
        elif dtype == torch.float16:
            size_dict['float16'] += size_bytes
        elif dtype == torch.int8:
            size_dict['int8'] += size_bytes
        else:
            size_dict['other'] += size_bytes
    
    # Convert to MB
    for key in size_dict:
        size_dict[key] = size_dict[key] / (1024 * 1024)
        
    return size_dict

def print_model_info(model):
    """
    Print detailed model information including quantization
    """
    print("\nDetailed Model Analysis:")
    print("-" * 50)
    
    # Count layer types
    layer_types = {}
    for name, module in model.named_modules():
        module_type = type(module).__name__
        if module_type not in layer_types:
            layer_types[module_type] = 0
        layer_types[module_type] += 1
    
    print("Layer Types:")
    for layer_type, count in layer_types.items():
        print(f"- {layer_type}: {count}")
    
    # Get memory usage by dtype
    memory_usage = get_detailed_model_size(model)
    print("\nMemory Usage by dtype (MB):")
    for dtype, size in memory_usage.items():
        print(f"- {dtype}: {size:.2f} MB")
    
    # Check specific W8A16LinearLayer properties
    w8a16_layers = [m for m in model.modules() if isinstance(m, W8A16LinearLayer)]
    if w8a16_layers:
        sample_layer = w8a16_layers[0]
        print("\nW8A16LinearLayer Properties:")
        print(f"- int8_weights dtype: {sample_layer.int8_weights.dtype}")
        print(f"- scales dtype: {sample_layer.scales.dtype}")
        if sample_layer.bias is not None:
            print(f"- bias dtype: {sample_layer.bias.dtype}")

# Usage
print_model_info(model)


Detailed Model Analysis:
--------------------------------------------------
Layer Types:
- Phi3ForCausalLM: 1
- Phi3Model: 1
- Embedding: 1
- Dropout: 65
- ModuleList: 1
- Phi3DecoderLayer: 32
- Phi3Attention: 32
- W8A16LinearLayer: 129
- Phi3RotaryEmbedding: 32
- Phi3MLP: 32
- SiLU: 32
- Phi3RMSNorm: 65

Memory Usage by dtype (MB):
- float32: 380.51 MB
- float16: 0.00 MB
- int8: 3549.94 MB
- other: 0.00 MB

W8A16LinearLayer Properties:
- int8_weights dtype: torch.int8
- scales dtype: torch.float32


In [91]:
new_memory_footprint = model.get_memory_footprint()
print("Footprint of the model in MBs: ", new_memory_footprint/1e+6)

Footprint of the model in MBs:  4121.378048


: 

## Part-2

In [1]:
import gc
import time
import torch
import psutil
import numpy as np
import transformers
import bitsandbytes as bnb
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn.utils.parametrize import remove_parametrizations

In [24]:
from tqdm.auto import tqdm

In [31]:
def check_cuda_availability():
    """Check CUDA availability and initialize it"""
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Please check your GPU installation.")
    
    # Initialize CUDA
    torch.cuda.init()
    device = torch.device("cuda")
    
    # Print CUDA information
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    return device

def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process()
    cpu_memory = process.memory_info().rss / (1024 * 1024 * 1024)
    
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / (1024 * 1024 * 1024)
        return cpu_memory, gpu_memory
    return cpu_memory, 0

def measure_inference_latency(model, tokenizer, input_text, num_runs=10):
    """Measure average inference latency"""
    try:
        inputs = tokenizer(input_text, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        # Warmup
        # for _ in range(5):
        #     with torch.no_grad():
        #         _ = model.generate(**inputs, max_new_tokens=20)
        
        # Measure latency
        latencies = []
        for _ in tqdm(range(num_runs), desc="Measuring Latency", leave=False):
            torch.cuda.synchronize()  # Ensure CUDA operations are complete
            start_time = time.time()
            with torch.no_grad():
                _ = model.generate(**inputs, max_new_tokens=20)
            torch.cuda.synchronize()  # Ensure CUDA operations are complete
            latencies.append(time.time() - start_time)
        
        return np.mean(latencies)
    except Exception as e:
        print(f"Error during inference: {str(e)}")
        raise

def compute_perplexity(model, tokenizer, dataset, max_samples=3000):
    """Compute perplexity on dataset"""
    try:
        model.eval()
        total_loss = 0
        total_length = 0
        
        for i, sample in tqdm(enumerate(dataset), 'Calculating Perplexity', leave=False):
            if i >= max_samples:
                break
                
            inputs = tokenizer(sample['text'], return_tensors='pt', truncation=True, max_length=512)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model(**inputs, labels=inputs['input_ids'])
                loss = outputs.loss
                
            total_loss += loss.item() * inputs['input_ids'].size(1)
            total_length += inputs['input_ids'].size(1)
        
        return torch.exp(torch.tensor(total_loss / total_length))
    except Exception as e:
        print(f"Error computing perplexity: {str(e)}")
        raise

def load_model(model_name, quantization=None):
    """Load model with specified quantization"""
    try:
        if quantization == "8bit":
            return AutoModelForCausalLM.from_pretrained(
                model_name,
                load_in_8bit=True,
                device_map="auto",
                torch_dtype=torch.float16
            )
        elif quantization == "4bit":
            return AutoModelForCausalLM.from_pretrained(
                model_name,
                load_in_4bit=True,
                device_map="auto",
                torch_dtype=torch.float16
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16
            )
            # model.to("cuda")
            return model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

In [7]:
# def main():
#     # Load model and tokenizer
#     model_name = "microsoft/phi-3-mini-4k-instruct"
#     print(f"Loading {model_name}...")
    
#     # Load original model
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     original_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
#     original_model.to('cuda')
    
#     # Load dataset
#     print("Loading Wikipedia dataset...")
#     # dataset = load_dataset("wikipedia", "20220301.en", split="train")
#     dataset = load_dataset("ptb_text_only", "penn_treebank", split="test", trust_remote_code=True)
#     # Ensure dataset size is at least 3000 points
#     if len(dataset) > 3000:
#         dataset = dataset.select(range(3000))
#     else:
#         print(f"Warning: Dataset size ({len(dataset)}) is less than 3000 points")
    
#     # Benchmark original model
#     print("\nBenchmarking original model (FP16)...")
#     original_memory = get_memory_usage()
#     original_latency = measure_inference_latency(
#         original_model, 
#         tokenizer, 
#         "The quick brown fox jumps over the lazy dog"
#     )
#     original_perplexity = compute_perplexity(original_model, tokenizer, dataset)
    
#     # Clear memory
#     del original_model
#     gc.collect()
#     torch.cuda.empty_cache()
    
#     # Load and benchmark 8-bit model
#     print("\nLoading and benchmarking 8-bit model...")
#     model_8bit = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         load_in_8bit=True,
#         device_map="auto"
#     )
    
#     memory_8bit = get_memory_usage()
#     latency_8bit = measure_inference_latency(
#         model_8bit, 
#         tokenizer, 
#         "The quick brown fox jumps over the lazy dog"
#     )
#     perplexity_8bit = compute_perplexity(model_8bit, tokenizer, dataset)
    
#     # Clear memory
#     del model_8bit
#     gc.collect()
#     torch.cuda.empty_cache()
    
#     # Load and benchmark 4-bit model
#     print("\nLoading and benchmarking 4-bit model...")
#     model_4bit = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         load_in_4bit=True,
#         device_map="auto"
#     )
    
#     memory_4bit = get_memory_usage()
#     latency_4bit = measure_inference_latency(
#         model_4bit, 
#         tokenizer, 
#         "The quick brown fox jumps over the lazy dog"
#     )
#     perplexity_4bit = compute_perplexity(model_4bit, tokenizer, dataset)
    
#     # Print results
#     print("\n=== Results ===")
#     print("\nMemory Usage (GB):")
#     print(f"Original (FP16): {original_memory:.2f}")
#     print(f"8-bit: {memory_8bit:.2f}")
#     print(f"4-bit: {memory_4bit:.2f}")
    
#     print("\nInference Latency (seconds):")
#     print(f"Original (FP16): {original_latency:.4f}")
#     print(f"8-bit: {latency_8bit:.4f}")
#     print(f"4-bit: {latency_4bit:.4f}")
    
#     print("\nPerplexity:")
#     print(f"Original (FP16): {original_perplexity:.2f}")
#     print(f"8-bit: {perplexity_8bit:.2f}")
#     print(f"4-bit: {perplexity_4bit:.2f}")

In [32]:
def main():
    try:
        # Check CUDA availability and initialize
        device = check_cuda_availability()
        
        # Load model and tokenizer
        model_name = "microsoft/phi-3-mini-4k-instruct"
        print(f"Loading {model_name}...")
        
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Load dataset
        print("Loading Wikipedia dataset...")
        
        # Test input
        test_input = "The quick brown fox jumps over the lazy dog"
        # Load PTB dataset
        dataset = load_dataset("ptb_text_only", "penn_treebank", split="test", trust_remote_code=True)
        
        # Ensure dataset size is at least 3000 points
        if len(dataset) > 3000:
            dataset = dataset.select(range(3000))
        else:
            print(f"Warning: Dataset size ({len(dataset)}) is less than 3000 points")
        
        # Dictionary to store results
        results = {}
        
        # Test different quantization levels
        for model_type in ["original"]: # "8bit", "4bit"
            print(f"\nTesting {model_type} model...")
            
            # Load appropriate model
            model = load_model(model_name, model_type if model_type != "original" else None)
            
            # Measure metrics
            cpu_mem, gpu_mem = get_memory_usage()
            print('memory uasge computed.')
            latency = measure_inference_latency(model, tokenizer, test_input)
            print(f"\tCalculating perplexity for {model_type} model...", end='\r')
            perplexity = compute_perplexity(model, tokenizer, dataset)
            print(f"\tCalculated perplexity for {model_type} model               ")
            
            # Store results
            results[model_type] = {
                "cpu_memory": cpu_mem,
                "gpu_memory": gpu_mem,
                "latency": latency,
                "perplexity": perplexity
            }
            
            # Clean up
            del model
            gc.collect()
            torch.cuda.empty_cache()
            
        # Print results
        print("\n=== Results ===")
        for model_type, metrics in results.items():
            print(f"\n{model_type.upper()} MODEL:")
            print(f"CPU Memory: {metrics['cpu_memory']:.2f} GB")
            print(f"GPU Memory: {metrics['gpu_memory']:.2f} GB")
            print(f"Inference Latency: {metrics['latency']:.4f} seconds")
            print(f"Perplexity: {metrics['perplexity']:.2f}")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

In [33]:
gc.collect()
torch.cuda.empty_cache()

In [30]:
main()

Using GPU: NVIDIA GeForce RTX 2080 Ti
CUDA Version: 12.1
Loading microsoft/phi-3-mini-4k-instruct...
Loading Wikipedia dataset...

Testing original model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 