In [4]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-win_amd64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.0-py3-none-win_amd64.whl (66.5 MB)
   ---------------------------------------- 0.0/66.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/66.5 MB ? eta -:--:--
    --------------------------------------- 1.3/66.5 MB 3.9 MB/s eta 0:00:17
   - -------------------------------------- 2.1/66.5 MB 3.9 MB/s eta 0:00:17
   - -------------------------------------- 2.9/66.5 MB 3.8 MB/s eta 0:00:17
   -- ------------------------------------- 3.7/66.5 MB 3.8 MB/s eta 0:00:17
   -- ------------------------------------- 4.5/66.5 MB 3.8 MB/s eta 0:00:17
   --- ------------------------------------ 5.2/66.5 MB 3.8 MB/s eta 0:00:16
   --- ------------------------------------ 6.0/66.5 MB 3.8 MB/s eta 0:00:16
   ---- ----------------------------------- 7.1/66.5 MB 3.8 MB/s eta 0:00:16
   ---- ----------------------------------- 7.9/66.5 MB 3.8 MB/s eta 0:00:16
   -

In [1]:
import torch
import time
import psutil
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    pipeline
)
from evaluate import load
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
import gc

In [None]:
# ---------------------------------------------------------------------------
# Benchmark settings
# ---------------------------------------------------------------------------
# Hugging Face model identifier to load.
MODEL_NAME = "meta-llama/Llama-3.2-1B"

# Dataset providing the question-answering prompts (SQuAD).
DATASET_NAME = "squad"

# How many dataset examples are turned into prompts; kept small for quick runs.
NUM_SAMPLES = 50

# Upper bound on tokens generated per prompt.
MAX_NEW_TOKENS = 100

# Run on the GPU when PyTorch reports CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
class ModelBenchmark:
    """Compare one causal language model across several numeric precisions.

    For every precision the model named by ``MODEL_NAME`` is loaded, the byte
    size of its parameters and buffers is measured, greedy generation is timed
    on a few prompts, and an average loss / perplexity is computed on a few
    prompts. One result dict per successfully benchmarked precision is appended
    to ``self.results``; ``display_results`` prints the comparison tables.
    """

    # Precisions loaded by passing the torch dtype of the same name.
    _FLOAT_PRECISIONS = ("float32", "float16", "bfloat16")
    # Number of prompts used for the timing pass and for the quality pass.
    TIMING_PROMPTS = 10
    QUALITY_PROMPTS = 20
    # Prompts are truncated to this many tokens before being fed to the model.
    MAX_INPUT_TOKENS = 512

    def __init__(self):
        """Load the tokenizer for ``MODEL_NAME`` and start with no results."""
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Fall back to the EOS token when the tokenizer defines no pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.results = []

    def get_model_size_mb(self, model) -> float:
        """Return the combined byte size of ``model``'s parameters and buffers in MiB."""
        param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
        buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())
        return (param_bytes + buffer_bytes) / (1024 ** 2)

    def load_model_with_precision(self, precision: str):
        """Load ``MODEL_NAME`` in the requested precision.

        Args:
            precision: One of "float32", "float16", "bfloat16" or "int8".
                "int8" loads through a ``BitsAndBytesConfig`` with
                ``load_in_8bit=True``; the others pass the matching torch dtype.

        Returns:
            The model returned by ``AutoModelForCausalLM.from_pretrained``.

        Raises:
            ValueError: If ``precision`` is not one of the supported names.
        """
        supported = self._FLOAT_PRECISIONS + ("int8",)
        # Fail with a clear message instead of an UnboundLocalError further down.
        if precision not in supported:
            raise ValueError(
                f"Unsupported precision {precision!r}; expected one of {supported}"
            )

        print(f"\n🔄 Loading model in {precision} precision...")

        load_kwargs = {"device_map": "auto" if DEVICE == "cuda" else None}
        if precision == "int8":
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0
            )
        else:
            # The precision names match the torch dtype attribute names.
            load_kwargs["torch_dtype"] = getattr(torch, precision)

        return AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

    def prepare_dataset(self) -> List[str]:
        """Build up to ``NUM_SAMPLES`` "Context/Question/Answer:" prompts.

        The prompts come from the validation split of ``DATASET_NAME``.
        """
        print("📚 Loading SQuAD dataset...")
        dataset = load_dataset(DATASET_NAME, split="validation")

        # zip with range stops after NUM_SAMPLES examples without indexing.
        return [
            f"Context: {example['context']}\nQuestion: {example['question']}\nAnswer:"
            for _, example in zip(range(NUM_SAMPLES), dataset)
        ]

    def _encode(self, prompt: str):
        """Tokenize one prompt (truncated) and move it to the GPU when in use."""
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.MAX_INPUT_TOKENS
        )
        if DEVICE == "cuda":
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        return inputs

    @staticmethod
    def _sync_device():
        """Wait for queued CUDA work so wall-clock timings cover the real compute."""
        if DEVICE == "cuda":
            torch.cuda.synchronize()

    def benchmark_inference_time(self, model, prompts: List[str]) -> float:
        """Return the mean seconds per greedy ``generate`` call.

        Only the first ``TIMING_PROMPTS`` prompts are used. Returns NaN when
        ``prompts`` is empty.
        """
        print("⏱️  Measuring inference time...")

        durations = []
        model.eval()

        with torch.no_grad():
            for prompt in prompts[:self.TIMING_PROMPTS]:
                inputs = self._encode(prompt)

                # perf_counter is monotonic; synchronize so asynchronous GPU
                # kernels are not left out of (or leaked into) the interval.
                self._sync_device()
                started = time.perf_counter()

                model.generate(
                    **inputs,
                    max_new_tokens=MAX_NEW_TOKENS,
                    do_sample=False,
                    pad_token_id=self.tokenizer.eos_token_id
                )

                self._sync_device()
                durations.append(time.perf_counter() - started)

        # Avoid numpy's "mean of empty slice" warning for an empty prompt list.
        return float(np.mean(durations)) if durations else float("nan")

    def evaluate_quality(self, model, prompts: List[str]) -> Dict:
        """Compute the mean language-modeling loss and its exponential.

        Each of the first ``QUALITY_PROMPTS`` prompts is scored with its own
        token ids as labels. A prompt that raises is reported and skipped
        (best-effort), so ``valid_samples`` may be lower than the number tried.

        Returns:
            Dict with "average_loss" (mean of per-prompt losses, ``inf`` when no
            prompt succeeded), "perplexity" (``exp(average_loss)``) and
            "valid_samples".
        """
        print("📊 Evaluating model quality...")

        model.eval()
        total_loss = 0
        valid_samples = 0

        with torch.no_grad():
            for prompt in prompts[:self.QUALITY_PROMPTS]:
                try:
                    inputs = self._encode(prompt)
                    outputs = model(**inputs, labels=inputs["input_ids"])
                    total_loss += outputs.loss.item()
                    valid_samples += 1
                except Exception as e:
                    print(f"⚠️  Skipping sample due to error: {e}")

        avg_loss = total_loss / valid_samples if valid_samples > 0 else float('inf')

        return {
            "average_loss": avg_loss,
            "perplexity": np.exp(avg_loss),
            "valid_samples": valid_samples
        }

    def run_benchmark(self, precision: str, prompts: List[str]) -> Dict:
        """Load the model in ``precision``, measure it, and return one result dict.

        The dict holds "precision", "model_size_mb", "load_time_sec",
        "avg_inference_time_sec", "memory_usage_mb" (resident set size of this
        process) plus the keys returned by ``evaluate_quality``. The model is
        released even when a measurement step raises.
        """
        print(f"\n{'='*50}")
        print(f"🧪 TESTING {precision.upper()} PRECISION")
        print(f"{'='*50}")

        # Collect garbage first so the CUDA cache can actually hand back the
        # blocks that belonged to objects from a previous run.
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

        model = None
        try:
            start_load_time = time.time()
            model = self.load_model_with_precision(precision)
            load_time = time.time() - start_load_time

            model_size_mb = self.get_model_size_mb(model)
            avg_inference_time = self.benchmark_inference_time(model, prompts)
            quality_metrics = self.evaluate_quality(model, prompts)

            # Host-process memory only; GPU allocations are not part of RSS.
            memory_usage = psutil.Process().memory_info().rss / (1024 ** 2)

            return {
                "precision": precision,
                "model_size_mb": model_size_mb,
                "load_time_sec": load_time,
                "avg_inference_time_sec": avg_inference_time,
                "memory_usage_mb": memory_usage,
                **quality_metrics
            }
        finally:
            # Runs on success and on failure, so a crashed precision does not
            # keep its weights alive while the next one loads.
            del model
            gc.collect()
            if DEVICE == "cuda":
                torch.cuda.empty_cache()

    def run_all_benchmarks(self):
        """Benchmark every precision in turn, then print the comparison.

        A precision whose benchmark raises is reported and skipped; the
        remaining precisions still run.
        """
        print("🚀 Starting Llama 3.2-1B Precision Comparison")
        print(f"Device: {DEVICE}")
        print(f"Samples: {NUM_SAMPLES}")

        prompts = self.prepare_dataset()

        precisions = ["float32", "float16", "bfloat16", "int8"]

        for precision in precisions:
            try:
                result = self.run_benchmark(precision, prompts)
                self.results.append(result)
                print(f"✅ {precision} completed successfully")
            except Exception as e:
                print(f"❌ {precision} failed: {e}")

        self.display_results()

    @staticmethod
    def _float32_baseline(df, column: str):
        """Return the float32 row's value in ``column``, or None without such a row."""
        values = df.loc[df['precision'] == 'float32', column]
        return None if values.empty else values.iloc[0]

    @staticmethod
    def _signed_percent_change(value, baseline) -> str:
        """Format ``value`` relative to ``baseline`` as a signed percentage.

        The sign comes from the number itself ("+0.1%", "-50.0%"), so a negative
        change can never be rendered with a doubled or contradictory sign.
        """
        if baseline is None or baseline == 0:
            return "N/A"
        return f"{(value - baseline) / baseline * 100:+.1f}%"

    @staticmethod
    def _relative_speed(baseline_time, value_time) -> str:
        """Describe ``value_time`` against ``baseline_time`` as "Nx faster/slower"."""
        if baseline_time is None or not (baseline_time > 0 and value_time > 0):
            return "N/A"
        ratio = baseline_time / value_time
        if ratio >= 1:
            return f"{ratio:.2f}x faster"
        return f"{1 / ratio:.2f}x slower"

    def display_results(self):
        """Print size, timing and perplexity tables plus a recommendation.

        Each table compares against the float32 row when one exists ("N/A"
        otherwise). With more than one result, a recommendation is derived from
        an equally weighted score over size, time and perplexity.
        """
        if not self.results:
            print("❌ No results to display")
            return

        df = pd.DataFrame(self.results)

        print(f"\n{'='*80}")
        print("📊 LLAMA 3.2-1B PRECISION COMPARISON RESULTS")
        print(f"{'='*80}")

        float32_size = self._float32_baseline(df, 'model_size_mb')
        float32_time = self._float32_baseline(df, 'avg_inference_time_sec')
        float32_ppl = self._float32_baseline(df, 'perplexity')

        # Storage Impact
        print("\n🗄️  STORAGE REQUIREMENTS:")
        print(f"{'Precision':<12} {'Size (MB)':<12} {'vs Float32':<15}")
        print("-" * 40)
        for _, row in df.iterrows():
            size_mb = row['model_size_mb']
            change = self._signed_percent_change(size_mb, float32_size)
            print(f"{row['precision']:<12} {size_mb:<12.1f} {change:<15}")

        # Inference Time Impact
        print("\n⏱️  INFERENCE TIME:")
        print(f"{'Precision':<12} {'Time (sec)':<12} {'vs Float32':<15}")
        print("-" * 40)
        for _, row in df.iterrows():
            time_sec = row['avg_inference_time_sec']
            speed = self._relative_speed(float32_time, time_sec)
            print(f"{row['precision']:<12} {time_sec:<12.3f} {speed:<15}")

        # Performance Impact
        print("\n📈 MODEL PERFORMANCE:")
        print(f"{'Precision':<12} {'Perplexity':<12} {'vs Float32':<15}")
        print("-" * 40)
        for _, row in df.iterrows():
            ppl = row['perplexity']
            change = self._signed_percent_change(ppl, float32_ppl)
            print(f"{row['precision']:<12} {ppl:<12.2f} {change:<15}")

        # Summary and Recommendation
        print(f"\n{'='*80}")
        print("🎯 QUANTIZATION ANALYSIS & RECOMMENDATION")
        print(f"{'='*80}")

        if len(df) > 1:
            # Lower is better for all three metrics, so each score is
            # "1 - normalized value" and the highest combined score wins.
            df_norm = df.copy()
            df_norm['size_score'] = 1 - (df_norm['model_size_mb'] / df_norm['model_size_mb'].max())
            df_norm['time_score'] = 1 - (df_norm['avg_inference_time_sec'] / df_norm['avg_inference_time_sec'].max())

            ppl_min = df_norm['perplexity'].min()
            ppl_max = df_norm['perplexity'].max()
            if ppl_max != ppl_min:
                df_norm['quality_score'] = 1 - (df_norm['perplexity'] - ppl_min) / (ppl_max - ppl_min)
            else:
                # Identical perplexities: every precision gets full quality marks.
                df_norm['quality_score'] = 1.0

            # Combined score (equal weights)
            df_norm['combined_score'] = (df_norm['size_score'] + df_norm['time_score'] + df_norm['quality_score']) / 3
            best_precision = df_norm.loc[df_norm['combined_score'].idxmax(), 'precision']

            print(f"🏆 RECOMMENDED PRECISION: {best_precision.upper()}")

            best_row = df[df['precision'] == best_precision].iloc[0]
            print(f"   • Model Size: {best_row['model_size_mb']:.1f} MB")
            print(f"   • Inference Time: {best_row['avg_inference_time_sec']:.3f} sec")
            print(f"   • Perplexity: {best_row['perplexity']:.2f}")

            print(f"\n💡 INSIGHTS:")
            if 'int8' in df['precision'].values:
                int8_row = df[df['precision'] == 'int8'].iloc[0]
                if float32_size:
                    size_reduction = (float32_size - int8_row['model_size_mb']) / float32_size * 100
                    print(f"   • INT8 quantization saves {size_reduction:.1f}% storage space")
                int8_time = int8_row['avg_inference_time_sec']
                if float32_time and int8_time > 0:
                    speed_ratio = float32_time / int8_time
                    if speed_ratio >= 1:
                        print(f"   • INT8 quantization provides {speed_ratio:.2f}x speedup")
                    else:
                        print(f"   • INT8 quantization is {1 / speed_ratio:.2f}x slower than float32")

            # Only claim that reduced precision pays off when the measurements
            # actually ranked a reduced-precision variant first.
            if best_precision != 'float32':
                print(f"   • Quantization IS worth it for deployment scenarios prioritizing:")
                print(f"     - Reduced memory usage")
                print(f"     - Faster inference")
                print(f"     - Lower computational costs")
                print(f"   • Use float32 only when maximum precision is critical")
            else:
                print(f"   • float32 scored best on the combined metric in this run")

In [None]:
def main():
    """Create a ModelBenchmark and run the benchmark for every precision."""
    ModelBenchmark().run_all_benchmarks()

if __name__ == "__main__":
    # Remind the reader which packages the benchmark depends on. Nothing is
    # installed here: the pip commands are only printed.
    required_packages = [
        "torch",
        "transformers",
        "datasets",
        "evaluate",
        "bitsandbytes",
        "accelerate",
        "psutil",
        "pandas",
        "numpy",
    ]
    separator = "=" * 50

    print("📦 Required packages:")
    for package in required_packages:
        print(f"   pip install {package}")

    # Banner, then hand over to the benchmark entry point.
    print("\n" + separator)
    print("🚀 Starting benchmark...")
    print(separator)

    main()

📦 Required packages:
   pip install torch
   pip install transformers
   pip install datasets
   pip install evaluate
   pip install bitsandbytes
   pip install accelerate
   pip install psutil
   pip install pandas
   pip install numpy

🚀 Starting benchmark...
🚀 Starting Llama 3.2-1B Precision Comparison
Device: cpu
Samples: 50
📚 Loading SQuAD dataset...


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]


🧪 TESTING FLOAT32 PRECISION

🔄 Loading model in float32 precision...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


⏱️  Measuring inference time...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

📊 Evaluating model quality...
✅ float32 completed successfully

🧪 TESTING FLOAT16 PRECISION

🔄 Loading model in float16 precision...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


⏱️  Measuring inference time...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

📊 Evaluating model quality...
✅ float16 completed successfully

🧪 TESTING BFLOAT16 PRECISION

🔄 Loading model in bfloat16 precision...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


⏱️  Measuring inference time...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

📊 Evaluating model quality...
✅ bfloat16 completed successfully

🧪 TESTING INT8 PRECISION

🔄 Loading model in int8 precision...
❌ int8 failed: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

📊 LLAMA 3.2-1B PRECISION COMPARISON RESULTS

🗄️  STORAGE REQUIREMENTS:
Precision    Size (MB)    vs Float32     
----------------------------------------
float32      4714.3       -0.0%          
float16      2357.1       -50.0%         
bfloat16     2357.1       -50.0%         

⏱️  INFERENCE TIME:
Precision    Time (sec)   vs Float32     
----------------------------------------
float32      15.478       1.00x faster   
float16      14.586       1.06x faster   
bfloat16     15.525       1.00x faster   

📈 MODEL PERFORMANCE:
Precision    Perplexity   vs Float32     
----------------------------------------
float32      4.64         +0.0%          
float16      4.64         +0.1%          
bfloat16     4.63         +-0.2%         
