<!-- ### Pruned Models
1. Wanda 0.3 pruned - https://huggingface.co/Arjun7m/Wanda-pruned-Llama-2-7b-0.3/tree/main
2. Wanda 0.5 pruned - https://huggingface.co/Arjun7m/Wanda-pruned-Llama-2-7b-0.5/tree/

3. SparseGPT 0.3 pruned - https://huggingface.co/Arjun7m/SparseGPT-pruned-Llama-2-7b-0.3/tree/main
4. SparseGPT 0.5 pruned - https://huggingface.co/Rabinovich/sparsegpt_pruned_llama_2_7B

5. Magnitude pruned - https://huggingface.co/Rabinovich/magnitude_pruned_llama_2_7B/tree/main  -->

In [None]:
# !pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git
# !pip install tiktoken protobuf transformers accelerate sentencepiece bitsandbytes
import argparse, time
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Vanilla Llama
# model_name          = "meta-llama/Llama-2-7b-hf" 

# Wanda Pruned Llama
# model_name          = "Rabinovich/wanda_pruned_llama_2_7B" # 50 %
# model_name          = "Arjun7m/Wanda-pruned-Llama-2-7b-0.3" # 30%

# 8-bit Quantization
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# 4-bit Quantization
# Update: BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) --> Check
## Also check: bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True
quantization_config   = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# model               = AutoModel.from_pretrained(model_name, trust_remote_code=True, quantization_config=quantization_config)

model_name            = "Arjun7m/Wanda-pruned-Llama-2-7b-0.3"
model                 = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map="auto", quantization_config=quantization_config)
tokenizer             = AutoTokenizer.from_pretrained(model_name)
config                = AutoConfig.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.86s/it]


In [None]:
def calculate_transformer_flops(config, seq_length=128):
    global model
    d_model = config.hidden_size
    n_layers = config.num_hidden_layers
    n_heads = config.num_attention_heads
    
    if hasattr(config, 'num_key_value_heads'):
        n_kv_heads = config.num_key_value_heads  # For models with GQA/MQA
    else:
        n_kv_heads = n_heads
    
    d_ff = config.intermediate_size
    vocab_size = config.vocab_size
    
    # Calculate head dimension
    d_head = d_model // n_heads
    
    # Count non-embedding parameters (all parameters except token and position embeddings)
    embedding_params = vocab_size * d_model
    total_params = sum(p.numel() for p in model.parameters())
    non_embedding_params = total_params - embedding_params
    
    # Linear terms (scale with sequence length)
    # Attention: QKV projections
    attn_qkv = 2 * seq_length * 3 * d_model * d_model
    
    # Attention: Output projection 
    attn_output = 2 * seq_length * d_model * d_model
    
    # MLP/FFN layers
    ffn = 2 * seq_length * (d_model * d_ff + d_ff * d_model)
    
    # Quadratic terms (scale with sequence length squared)
    # Attention: QK^T calculations 
    attn_scores = 2 * seq_length * seq_length * d_head * n_heads
    
    # Attention: Softmax (approximately)
    attn_softmax = 3 * n_heads * seq_length * seq_length
    
    # Attention: Attention weights * V 
    attn_weight_v = 2 * seq_length * seq_length * d_head * n_heads
    
    # Calculate per-layer FLOPs
    flops_per_layer = attn_qkv + attn_output + ffn + attn_scores + attn_softmax + attn_weight_v
    
    # Total FLOPs for forward pass (all layers)
    total_flops = n_layers * flops_per_layer
    
    # Add embedding and final classifier FLOPs
    embedding_flops = seq_length * d_model
    classifier_flops = seq_length * d_model * vocab_size
    
    total_flops += embedding_flops + classifier_flops
    
    # FLOPs per token (divide by sequence length)
    flops_per_token = total_flops / seq_length
    
    # Convert to more readable units (GFLOPs)
    total_gflops = total_flops / (10**9)
    
    return total_gflops, flops_per_token / (10**9), total_params

def load_llama_model(model_id, quantization=None):
    """
    Load a LLaMA model with optional quantization
    
    Args:
        model_id: HuggingFace model ID or local path
        quantization: Quantization method (None, "4bit", "8bit")
        
    Returns:
        tuple: (model, tokenizer, config)
    """
    print(f"Loading model from {model_id}...")
    
    # Load configuration to get model info without loading weights
    config = AutoConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Set up quantization configuration
    model_kwargs = {"trust_remote_code": True}
    
    if torch.cuda.is_available():
        model_kwargs["device_map"] = "auto"
        
        if quantization == "4bit":
            try:
                from transformers import BitsAndBytesConfig
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_use_double_quant=True
                )
            except ImportError:
                print("Warning: bitsandbytes not installed. Using full precision.")
                model_kwargs["torch_dtype"] = torch.float16
                
        elif quantization == "8bit":
            try:
                from transformers import BitsAndBytesConfig
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True
                )
            except ImportError:
                print("Warning: bitsandbytes not installed. Using full precision.")
                model_kwargs["torch_dtype"] = torch.float16
        else:
            model_kwargs["torch_dtype"] = torch.float16
            
    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    return model, tokenizer, config

def benchmark_inference(model, tokenizer, seq_lengths=[128, 512, 1024, 2048]):
    """
    Benchmark inference speed and calculate MFU (Model FLOPs Utilization)
    
    Args:
        model: The model to benchmark
        tokenizer: The tokenizer
        seq_lengths: List of sequence lengths to test
    """
    device = next(model.parameters()).device
    config = model.config
    
    # Estimate theoretical peak FLOPs for the device
    if torch.cuda.is_available():
        device_name = torch.cuda.get_device_name()
        # Rough estimates for common GPUs (TFLOPs)
        gpu_peak_tflops = {
            "A100": 312,  # A100-80GB
            "A10G": 31.2, # A10G
            "V100": 125,  # V100-32GB
            "T4": 65,     # T4 BF16
            "3090": 142,  # RTX 3090
            "4090": 330,  # RTX 4090
        }
        # Default to a conservative estimate if GPU not known
        peak_tflops = 0
        for gpu_name, tflops in gpu_peak_tflops.items():
            if gpu_name.lower() in device_name.lower():
                peak_tflops = tflops
                break
        if peak_tflops == 0:
            peak_tflops = 30  # Conservative default
            print(f"Unknown GPU: {device_name}. Using default peak performance: {peak_tflops} TFLOPs")
        else:
            print(f"Detected GPU: {device_name}. Estimated peak performance: {peak_tflops} TFLOPs")
    else:
        print("Running on CPU. MFU calculation will not be accurate.")
        peak_tflops = 1
    
    results = []
    
    print("\nBenchmarking inference speed and FLOPs:")
    print(f"{'Seq Length':<12} {'Time (ms)':<12} {'Tokens/sec':<12} {'TFLOPs':<12} {'MFU (%)':<12}")
    print(f"{'-'*60}")
    
    for seq_length in seq_lengths:
        # Create a simple input
        input_ids = torch.ones((1, seq_length), dtype=torch.long, device=device)
        
        # Warm up
        for _ in range(3):
            with torch.no_grad():
                model(input_ids)
        
        # Benchmark
        trials = 10
        start_time = time.time()
        for _ in range(trials):
            with torch.no_grad():
                model(input_ids)
        end_time = time.time()
        
        # Calculate metrics
        elapsed_time = (end_time - start_time) / trials * 1000  # ms
        tokens_per_sec = seq_length / (elapsed_time / 1000)
        
        # Calculate theoretical FLOPs
        total_gflops, gflops_per_token, _ = calculate_transformer_flops(config, seq_length)
        tflops = gflops_per_token * tokens_per_sec / 1000  # Convert to TFLOPs
        
        # Model FLOPs Utilization (%)
        mfu = (tflops / peak_tflops) * 100
        
        print(f"{seq_length:<12} {elapsed_time:<12.2f} {tokens_per_sec:<12.2f} {tflops:<12.2f} {mfu:<12.2f}")
        
        results.append({
            "seq_length": seq_length,
            "time_ms": elapsed_time,
            "tokens_per_sec": tokens_per_sec,
            "tflops": tflops,
            "mfu": mfu
        })
    
    return results


class Args:
    def __init__(self, model_id, quantization, seq_length=128, benchmark=True):
        self.model_id = model_id
        self.quantization = quantization
        self.seq_length = seq_length
        self.benchmark = benchmark


args = Args("", "", 128, True)

total_gflops, gflops_per_token, total_params = calculate_transformer_flops(config, seq_length=args.seq_length)

# Print model information
print("\nModel Information:")
print(f"Model: {args.model_id}")
print(f"Quantization: {args.quantization if args.quantization else 'None (full precision)'}")
print(f"Parameters: {total_params:,}")
print(f"Hidden Size: {config.hidden_size}")
print(f"Layers: {config.num_hidden_layers}")
print(f"Attention Heads: {config.num_attention_heads}")
if hasattr(config, 'num_key_value_heads'):
    print(f"KV Heads: {config.num_key_value_heads}")
print(f"Intermediate Size: {config.intermediate_size}")

# Print FLOPs information
print("\nFLOPs Information:")
print(f"Sequence Length: {args.seq_length}")
print(f"Total GFLOPs (forward pass): {total_gflops:.2f}")
print(f"GFLOPs per token: {gflops_per_token:.2f}")

# Run benchmark if requested
if args.benchmark:
    benchmark_inference(model,tokenizer)

In [4]:
# Save Path
# save_path = "./magnitude_pruned_llama_2_7B"

save_path = "./test"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./test/tokenizer_config.json',
 './test/special_tokens_map.json',
 './test/tokenizer.model',
 './test/added_tokens.json',
 './test/tokenizer.json')

In [7]:
!lm-eval --model hf \
  --model_args pretrained=./test \
  --tasks acp_bench,anli,commonsense_qa,mmlu,piqa,winogender \
  --device cuda \
  --limit 10
  # --output output/hellaswag/
  # --log_samples
# Wanda-pruned-Llama-2-7b-0.3
# Tasks - acp_bench,anli,commonsense_qa,mmlu,piqa,pubmedqa,winogender

2025-04-12:20:29:55 INFO     [__main__:440] Selected Tasks: ['acp_bench', 'anli', 'commonsense_qa', 'mmlu', 'piqa', 'winogender']
2025-04-12:20:29:55 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-12:20:29:55 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './test'}
2025-04-12:20:29:55 INFO     [models.huggingface:136] Using device 'cuda'
2025-04-12:20:29:55 INFO     [models.huggingface:377] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}
Loading checkpoint shards: 100%|██████████████████| 6/6 [00:03<00:00,  1.80it/s]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at ./test and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
README.md: 100%|████████████████████