Experiment Tracking: Weights & Biases (WandB)

Experiments/Metrics added:

* Threshold Study

* Perplexity Metric

* Efficiency Metrics

* Aggressive Compression

* FP8 Comparison

* Block-wise sensitivity checks

* Choose other models (Llama-3, Mistral)



# Setup & Dependencies

Goal: Install libraries for quantization (bitsandbytes, llmcompressor) and evaluation.

In [None]:
!pip uninstall transformers torch torchaudio torchvision wandb -y
!pip install llmcompressor
!pip install -q accelerate bitsandbytes datasets scipy matplotlib wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset, concatenate_datasets
from datasets import Dataset
import copy
import gc
import time # Added for efficiency metrics
from tqdm import tqdm
import shutil
import wandb # Added for tracking

# Authenticate with Weights & Biases (prompts for an API key when needed)
wandb.login()

# Prefer the GPU when one is visible; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Found existing installation: transformers 4.56.2
Uninstalling transformers-4.56.2:
  Successfully uninstalled transformers-4.56.2
Found existing installation: torch 2.8.0
Uninstalling torch-2.8.0:
  Successfully uninstalled torch-2.8.0
[0mFound existing installation: wandb 0.23.1
Uninstalling wandb-0.23.1:
  Successfully uninstalled wandb-0.23.1
Collecting torch<=2.8.0,>=2.7.0 (from llmcompressor)
  Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting transformers<=4.56.2,>=4.53.0 (from llmcompressor)
  Using cached transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl (887.9 MB)
Using cached transformers-4.56.2-py3-none-any.whl (11.6 MB)
Installing collected packages: torch, transformers
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
timm 1.0.22 requires

[34m[1mwandb[0m: Currently logged in as: [33myq171014[0m ([33myq171014-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using device: cuda


In [None]:
import random
import numpy as np
from transformers import set_seed

SEED = 42

# Seed every RNG in the original order: Python, NumPy, torch (CPU),
# torch (all CUDA devices), then the transformers helper.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    set_seed,
):
    _seed_fn(SEED)
del _seed_fn  # do not leave the loop variable behind in the notebook namespace

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Configuration & Experiment Controls

Goal: Define all variables in one place. This makes switching models or thresholds easy.

Design: Use lists for models and thresholds to loop through later.

**------------------------------ Added ------------------------------------**

In [None]:
# --- Experiment Settings ---
# Hugging Face model IDs; a later cell selects one entry by index.
MODELS_TO_TEST = ["Qwen/Qwen2.5-0.5B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2"]
SENSITIVITY_THRESHOLDS = [0.0, 0.01, 0.05, 0.10, 0.20] # Fractions of the most sensitive profiled items to keep in FP16 (0.0 = none)
CALIBRATION_SAMPLES = 128 # Intended number of calibration samples
EVAL_SAMPLES = 200 # MMLU examples scored per evaluation; keep small for fast iteration, increase for final paper

# Toggle Features
# NOTE(review): despite its name, this flag gates Experiment B, which loads an Int8 model.
ENABLE_FP8_COMPARISON = True
ENABLE_BLOCK_WISE = False # True -> profile whole Transformer blocks ('block'); False -> individual Linear layers ('layer')
WANDB_PROJECT_NAME = "KLD_Quantization_Project" # W&B project that all runs are logged under

# Helper Functions (Metrics)
Goal: Define how you measure success.

New Code: Add Perplexity and Efficiency timers.

**------------------------------ Added------------------------------------**

In [None]:
# --- 0. Core Quantization Helpers ---
def fake_quantize_tensor_rtn(w, bits=4):
    """Simulate symmetric per-tensor round-to-nearest (RTN) quantization.

    The tensor is scaled so that its largest magnitude maps to the largest
    positive signed integer of the requested width, rounded, clamped to the
    signed integer range, and scaled back ("fake" quantization: the result
    keeps the input dtype).

    Args:
        w: Weight tensor to quantize.
        bits: Signed integer width to simulate (must be >= 2).

    Returns:
        A new tensor with the same shape and dtype as ``w`` whose values lie
        on the simulated integer grid.

    Raises:
        ValueError: If ``bits`` is smaller than 2 (no positive level exists).
    """
    if bits < 2:
        raise ValueError(f"bits must be >= 2 for signed quantization, got {bits}")
    if w.numel() == 0:
        # max() over an empty tensor raises; there is nothing to quantize.
        return w.clone()

    # Derive the clamp range from `bits` instead of hard-coding the 4-bit
    # range (-8, 7), which silently crushed wider formats such as 8-bit.
    qmax = 2 ** (bits - 1) - 1
    qmin = -(2 ** (bits - 1))

    scale = w.abs().max() / qmax
    # An all-zero tensor gives scale == 0 and 0/0 = NaN; any non-zero scale
    # reproduces the zeros exactly, so substitute 1 without a host sync.
    scale = torch.where(scale == 0, torch.ones_like(scale), scale)
    return (w / scale).round().clamp(qmin, qmax) * scale

def recursive_getattr(obj, attr):
    """Resolve a dotted attribute path (e.g. ``"a.b.c"``) starting at ``obj``."""
    head, dot, remainder = attr.partition('.')
    child = getattr(obj, head)
    if not dot:
        # No separator left: `head` was the final path component.
        return child
    return recursive_getattr(child, remainder)

def recursive_setattr(obj, attr, val):
    """Assign ``val`` to the dotted attribute path ``attr`` under ``obj``.

    Everything before the last dot is resolved with ``recursive_getattr``;
    the final component is set on the resulting owner object.
    """
    owner_path, _, leaf_name = attr.rpartition('.')
    if owner_path:
        owner = recursive_getattr(obj, owner_path)
    else:
        # No parent path: the attribute lives directly on `obj`.
        owner = obj
    setattr(owner, leaf_name, val)

# --- 1. Metrics Helpers ---
def compute_kld(logits_p, logits_q):
    """Return KL(P || Q) between the distributions implied by two logit tensors.

    ``logits_p`` is the reference; ``logits_q`` is the approximation. Both are
    normalised over the last dimension. The 'batchmean' reduction sums the
    divergence and divides by the size of the first dimension.
    """
    reference_probs = F.softmax(logits_p, dim=-1)
    approx_log_probs = F.log_softmax(logits_q, dim=-1)
    divergence = F.kl_div(approx_log_probs, reference_probs, reduction='batchmean')
    return divergence.item()

def calculate_flip_rate(base_preds, new_preds):
    """Return the fraction of baseline answers that differ in ``new_preds``.

    Predictions are compared pairwise in order; the count of differing pairs
    is divided by ``len(base_preds)``. Returns 0.0 if either list is empty.
    """
    if not (base_preds and new_preds):
        return 0.0
    changed = 0
    for before, after in zip(base_preds, new_preds):
        if before != after:
            changed += 1
    return changed / len(base_preds)

def compute_perplexity(model, tokenizer):
    """Compute sliding-window perplexity on a small slice of WikiText-2.

    The first 20 lines of the WikiText-2 (raw) test split are joined and
    tokenized, then scored in windows of up to
    ``model.config.max_position_embeddings`` tokens advanced by a stride of
    512. Tokens already scored by an earlier window are masked with -100 so
    that every token contributes exactly once.

    Window losses are combined as a token-weighted average. Averaging the
    per-window means directly (the previous behaviour) over-weights short
    trailing windows and biases the result whenever more than one window is
    needed.

    Args:
        model: Causal LM called as ``model(input_ids, labels=...)`` returning
            an object with a ``.loss`` attribute.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Perplexity as a Python float, or NaN if no token could be scored.
    """
    lines = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:20]
    encodings = tokenizer("\n\n".join(lines), return_tensors="pt")
    max_length = model.config.max_position_embeddings
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0   # running sum of per-token negative log-likelihoods
    n_scored = 0    # number of tokens that contributed to nll_sum
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride), desc="Computing PPL"):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # tokens not yet scored by an earlier window
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        # Assumes the Hugging Face causal-LM convention: labels are shifted by
        # one inside the model, so position 0 never contributes to the loss.
        n_window = (target_ids[:, 1:] != -100).sum().item()
        if n_window > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # outputs.loss is the mean over this window's scored tokens;
            # multiply back by the count to recover the window's NLL sum.
            nll_sum += outputs.loss.float().item() * n_window
            n_scored += n_window

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored == 0:
        return float("nan")
    return torch.exp(torch.tensor(nll_sum / n_scored)).item()

def measure_efficiency(model, tokenizer, input_text="Hello world"):
    """Measure generation latency and peak GPU memory for one short prompt.

    Generates exactly 50 new tokens from ``input_text`` and reports the
    wall-clock time together with the peak CUDA memory allocated during the
    call.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        input_text: Prompt used for the timing run.

    Returns:
        ``(latency_seconds, peak_memory_gb)``. The memory figure is 0.0 when
        CUDA is unavailable.
    """
    input_ids = tokenizer(input_text, return_tensors="pt").to(device)

    # The module-level `device` may be "cpu"; the CUDA statistics calls are
    # guarded so the metric suite still runs there.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
        # NOTE: peak statistics are tracked for the current CUDA device only.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()  # do not bill earlier queued kernels to this run

    start_time = time.perf_counter()  # monotonic, high-resolution timer
    with torch.no_grad():
        # Generate 50 tokens to average out overhead
        _ = model.generate(**input_ids, max_new_tokens=50, min_new_tokens=50)
    if use_cuda:
        torch.cuda.synchronize()  # wait for outstanding GPU work before stopping the clock
    end_time = time.perf_counter()

    peak_mem = torch.cuda.max_memory_allocated() / 1024**3 if use_cuda else 0.0  # GB
    latency = end_time - start_time  # Seconds
    return latency, peak_mem

# --- 2. MMLU Logic ---
def format_mmlu_prompt(example):
    """Render one MMLU example as a chat-templated multiple-choice prompt.

    Reads ``example['question']`` and the first four entries of
    ``example['choices']``; uses the module-level ``tokenizer`` to apply its
    chat template and returns the resulting prompt string (not tokenized).
    """
    option_lines = []
    for idx, letter in enumerate("ABCD"):
        option_lines.append(f"{letter}. {example['choices'][idx]}")

    header = f"Question: {example['question']}\nOptions:\n"
    prompt_text = header + "\n".join(option_lines) + "\nAnswer:"

    system_turn = {
        "role": "system",
        "content": "Output only the single letter (A, B, C, or D) corresponding to the correct answer.",
    }
    user_turn = {"role": "user", "content": prompt_text}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

def get_mmlu_predictions(model, dataset, num_samples):
    """Predict an MMLU answer letter for the first ``num_samples`` examples.

    For each example the prompt is built with ``format_mmlu_prompt``, run
    through ``model`` once, and the next-token logits are compared across the
    four answer-letter tokens; the highest wins. Uses the module-level
    ``tokenizer`` and ``device``.

    Args:
        model: Causal LM returning ``.logits``.
        dataset: Indexable dataset whose items have ``question``, ``choices``
            and an integer ``answer`` index.
        num_samples: Maximum number of examples to evaluate.

    Returns:
        ``(predictions, ground_truths)`` as two lists of letters.

    Raises:
        ValueError: If the tokenizer does not map the four letters to four
            distinct token ids.
    """
    predictions, ground_truths = [], []
    choices = ["A", "B", "C", "D"]

    # Encode without special tokens and take the LAST id. With the default
    # settings, tokenizers that prepend BOS return the BOS id at index 0 for
    # every letter, so all four "choice" logits were the same token and
    # argmax always picked "A".
    choice_ids = [tokenizer.encode(c, add_special_tokens=False)[-1] for c in choices]
    if len(set(choice_ids)) != len(choices):
        raise ValueError(
            f"Answer letters do not map to distinct token ids: {choice_ids}"
        )

    for i in tqdm(range(min(num_samples, len(dataset))), desc="MMLU Eval"):
        ex = dataset[i]
        inputs = tokenizer(format_mmlu_prompt(ex), return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            # Logits of the four answer letters at the next-token position.
            logits = outputs.logits[0, -1, choice_ids]
            pred = choices[torch.argmax(logits).item()]
        predictions.append(pred)
        ground_truths.append(choices[ex['answer']])
    return predictions, ground_truths

# --- 3. Wrapper Function ---
def evaluate_full_suite(model, tokenizer, dataset, metric_name):
    """Run the MMLU, perplexity and efficiency measurements for one model.

    Returns ``(accuracy, perplexity, latency, peak_memory, predictions)``;
    ``metric_name`` is only used to label the console output.
    """
    print(f"--- Evaluating: {metric_name} ---")

    # Accuracy over the first EVAL_SAMPLES MMLU examples
    preds, truths = get_mmlu_predictions(model, dataset, EVAL_SAMPLES)
    n_correct = 0
    for guess, gold in zip(preds, truths):
        if guess == gold:
            n_correct += 1
    acc = n_correct / len(truths)

    # Language-modelling quality
    ppl = compute_perplexity(model, tokenizer)

    # Generation latency and peak memory
    lat, mem = measure_efficiency(model, tokenizer)

    print(f"Results -> Acc: {acc:.2%}, PPL: {ppl:.2f}, Latency: {lat:.2f}s, Mem: {mem:.2f}GB")
    return acc, ppl, lat, mem, preds

# Advanced Sensitivity Profiling
Goal: Identify which parts of the model to keep.

Refinement: Add a granularity parameter to switch between "Layer-by-Layer" (your current code) and "Block-by-Block" (checking a whole Transformer block at once).

**------------------------------ Added------------------------------------**

In [None]:
def profile_sensitivity(model, calib_input, granularity='layer'):
    """
    Profiles sensitivity of model parts (layers or blocks) using KLD.

    Each part is temporarily fake-quantized with RTN, the model is re-run on
    ``calib_input``, and the KL divergence between the unperturbed and the
    perturbed logits is recorded. The original weights are always put back,
    even if the forward pass raises, so ``model`` is never left perturbed.

    Args:
        model: The FP16 baseline model.
        calib_input: The input tensor for calibration (token ids).
        granularity: 'layer' (check individual Linear layers) or 'block' (check whole Transformer blocks).

    Returns:
        sensitivity_scores: Dict mapping name -> KLD score.

    Raises:
        ValueError: If ``granularity`` is neither 'layer' nor 'block'.
    """
    if granularity not in ('layer', 'block'):
        # Previously an unknown value silently produced an empty result.
        raise ValueError(f"granularity must be 'layer' or 'block', got {granularity!r}")

    print(f"Profiling Sensitivity (Granularity: {granularity})...")

    # 1. Compute Baseline Logits (Ground Truth)
    # We need these to compare against the perturbed versions
    with torch.no_grad():
        base_logits = model(calib_input).logits

    sensitivity_scores = {}

    # --- OPTION A: Layer-by-Layer ---
    if granularity == 'layer':
        # Identify all Linear layers
        linear_layers = {name: m for name, m in model.named_modules() if isinstance(m, nn.Linear)}

        for name, layer in tqdm(linear_layers.items(), desc="Profiling Layers"):
            # A. Backup original weights
            original_weight = layer.weight.data.clone()
            try:
                # B. Perturb with RTN (Fake Quantization)
                layer.weight.data = fake_quantize_tensor_rtn(original_weight)

                # C. Measure Distortion (KLD) against the baseline logits
                with torch.no_grad():
                    perturbed_logits = model(calib_input).logits
                sensitivity_scores[name] = compute_kld(base_logits, perturbed_logits)
            finally:
                # D. Restore original weights, even on error (e.g. OOM or
                # an interrupted cell); later cells reuse this model.
                layer.weight.data = original_weight

    # --- OPTION B: Block-by-Block ---
    else:
        # Identify Transformer Blocks
        # Note: For Qwen/Llama/Mistral, blocks are usually in model.model.layers
        if hasattr(model, 'model') and hasattr(model.model, 'layers'):
            blocks = model.model.layers
            # NOTE(review): keys built from this prefix ("model.model.layers.N")
            # are not attribute paths relative to `model` (that would be
            # "model.layers.N") and they name whole blocks rather than Linear
            # layers - confirm how block-level keys should be consumed.
            prefix = "model.model.layers"
        else:
            # Fallback for other architectures if needed
            print("Warning: Could not auto-detect block structure. Falling back to layer profiling.")
            return profile_sensitivity(model, calib_input, granularity='layer')

        for i, block in enumerate(tqdm(blocks, desc="Profiling Blocks")):
            block_name = f"{prefix}.{i}"

            # A. Backup ALL Linear weights inside this block
            block_linears = {n: m for n, m in block.named_modules() if isinstance(m, nn.Linear)}
            backup_weights = {n: m.weight.data.clone() for n, m in block_linears.items()}
            try:
                # B. Perturb ALL weights in this block simultaneously
                for m in block_linears.values():
                    m.weight.data = fake_quantize_tensor_rtn(m.weight.data)

                # C. Measure Distortion (KLD)
                with torch.no_grad():
                    perturbed_logits = model(calib_input).logits
                sensitivity_scores[block_name] = compute_kld(base_logits, perturbed_logits)
            finally:
                # D. Restore ALL weights in this block, even on error
                for n, m in block_linears.items():
                    m.weight.data = backup_weights[n]

    return sensitivity_scores

# The "Surgery" Implementation

Goal: The mechanism to mix FP16 and Int4 layers.

In [None]:
def perform_surgery(model, sensitive_names, fp16_model_cpu):
    """
    Replaces the sensitive quantized layers in 'model'
    with copies of the original layers from 'fp16_model_cpu'.

    For every dotted module name in ``sensitive_names`` the matching
    ``nn.Linear`` is looked up in ``fp16_model_cpu``, copied onto
    ``model.device`` in its original dtype, and swapped into ``model`` at the
    same path. Names that cannot be processed are reported and skipped
    (best effort).

    Args:
        model: Model whose layers are replaced in place.
        sensitive_names: Iterable of dotted module paths to restore.
        fp16_model_cpu: Reference model holding the original weights.

    Returns:
        Number of layers successfully replaced.
    """
    count = 0
    print(f"Surgery: Replacing {len(sensitive_names)} Sensitive Layers with FP16...")

    for name in sensitive_names:
        try:
            # 1. Get the original layer from the reference model
            original_layer = recursive_getattr(fp16_model_cpu, name)
            if not isinstance(original_layer, nn.Linear):
                # e.g. a block-level name; report it clearly instead of
                # failing later on a missing `in_features` attribute.
                raise TypeError(f"expected nn.Linear, found {type(original_layer).__name__}")

            # 2. Allocate the replacement directly on the target device and in
            #    the source dtype. skip_init avoids building and randomly
            #    initialising a throw-away float32 weight first.
            has_bias = original_layer.bias is not None
            new_layer = torch.nn.utils.skip_init(
                nn.Linear,
                original_layer.in_features,
                original_layer.out_features,
                bias=has_bias,
                device=model.device,
                dtype=original_layer.weight.dtype,
            )
            with torch.no_grad():
                # copy_ handles the cross-device transfer and never aliases
                # the reference model's storage.
                new_layer.weight.copy_(original_layer.weight)
                if has_bias:
                    new_layer.bias.copy_(original_layer.bias)

            # 3. Swap into the quantized model
            recursive_setattr(model, name, new_layer)
            count += 1
        except Exception as e:
            # Deliberate best effort: one bad name must not abort the rest.
            print(f"Skipping layer {name}: {e}")

    print(f"Surgery Complete: {count} layers restored.")
    return count

# Main Experiment Loop (The Core)
Goal: Run the full battery of tests.

In [None]:
import wandb
import pandas as pd
from datasets import load_dataset, concatenate_datasets

# 1. Login to WandB (if not already logged in)
# wandb.login()

# 2. Global results list - created once, then reused when the cell is re-run
globals().setdefault('results_table', [])

# 3. WandB project name
WANDB_PROJECT_NAME = "KLD_Quantization_Project"

# 4. Load the MMLU dataset ('elementary_mathematics' test split)
print("Loading MMLU Dataset...")
try:
    mmlu_dataset = concatenate_datasets(
        [load_dataset("cais/mmlu", "elementary_mathematics", split='test')]
    )
    print(f"MMLU Dataset Loaded. Size: {len(mmlu_dataset)} samples.")
except Exception as e:
    print(f"Error loading MMLU: {e}")
    # Fall back to a one-question dummy dataset so later cells still run
    # (for example, when the download fails).
    from datasets import Dataset
    mmlu_dataset = Dataset.from_dict(
        {
            "question": ["1+1=?"],
            "choices": [["1", "2", "3", "4"]],
            "answer": [1],
        }
    )

print("Global setup complete. Ready for Step 2.")

Loading MMLU Dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

dataset_infos.json: 0.00B [00:00, ?B/s]

elementary_mathematics/test-00000-of-000(…):   0%|          | 0.00/41.1k [00:00<?, ?B/s]

elementary_mathematics/validation-00000-(…):   0%|          | 0.00/9.38k [00:00<?, ?B/s]

elementary_mathematics/dev-00000-of-0000(…):   0%|          | 0.00/4.55k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/378 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/41 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

MMLU Dataset Loaded. Size: 378 samples.
Global setup complete. Ready for Step 2.


In [None]:
# Model Selection & Baseline Evaluation
# Loads the chosen model in FP16, evaluates it, and records the result both in
# W&B and in the in-memory results_table. Later cells read the globals set
# here: CURRENT_MODEL_ID, tokenizer, model_fp16 and base_preds.

# -----------------------------------------------------------------------------
# 1. SELECT YOUR MODEL HERE
# -----------------------------------------------------------------------------
# To test Mistral later, change this index to [1]
# NOTE(review): the output recorded below this cell shows Mistral selected,
# i.e. it was produced with index [1], not the [0] written here.
CURRENT_MODEL_ID = MODELS_TO_TEST[0]

print(f"{'='*40}\nSelected Model: {CURRENT_MODEL_ID}\n{'='*40}")

# 2. Load Tokenizer & Model (FP16)
# `tokenizer` is read as a global by format_mmlu_prompt / get_mmlu_predictions.
tokenizer = AutoTokenizer.from_pretrained(CURRENT_MODEL_ID)
print("Loading FP16 Baseline (This may take a minute)...")
model_fp16 = AutoModelForCausalLM.from_pretrained(
    CURRENT_MODEL_ID,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# 3. Evaluate Baseline (Ground Truth)
# We capture 'base_preds' here to compare all other models against it
base_acc, base_ppl, base_lat, base_mem, base_preds = evaluate_full_suite(
    model_fp16, tokenizer, mmlu_dataset, "FP16 Baseline"
)

# 4. Log Baseline to WandB
# The run name is the part of the model ID after the last '/', plus "-Baseline".
run = wandb.init(project=WANDB_PROJECT_NAME, name=f"{CURRENT_MODEL_ID.split('/')[-1]}-Baseline", reinit=True)
wandb.log({
    "Accuracy": base_acc,
    "Perplexity": base_ppl,
    "Latency": base_lat,
    "Memory": base_mem,
    "Threshold": 0,
    "Flip_Rate": 0.0,
    "Method": "Baseline"
})
run.finish()

# 5. Store in Results Table (same columns as the experiment cells)
results_table.append({
    "Model": CURRENT_MODEL_ID,
    "Method": "FP16 Baseline",
    "Threshold": 0,
    "Acc": base_acc,
    "Flip": 0.0,  # the baseline cannot differ from itself
    "PPL": base_ppl,
    "Latency": base_lat,
    "Mem": base_mem
})

print("Baseline Loaded & Evaluated.")

Selected Model: mistralai/Mistral-7B-Instruct-v0.2


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading FP16 Baseline (This may take a minute)...


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

--- Evaluating: FP16 Baseline ---


MMLU Eval: 100%|██████████| 200/200 [00:08<00:00, 23.14it/s]


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.02, Latency: 2.02s, Mem: 13.50GB




0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁
Threshold,▁

0,1
Accuracy,0.22
Flip_Rate,0
Latency,2.02148
Memory,13.50399
Method,Baseline
Perplexity,4.01865
Threshold,0


Baseline Loaded & Evaluated.


In [None]:
# Profiling & Offloading

# 1. Prepare Calibration Data for KLD
# Calibrate on the TRAIN split. compute_perplexity scores the first lines of
# the TEST split, so profiling on that same text would tune the layer
# selection on the evaluation data and flatter the perplexity numbers.
print("Preparing Calibration Data...")
calib_data = tokenizer(
    "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="train")["text"][:10]),
    return_tensors="pt"
).input_ids.to(device)

# 2. Run Profiling
print("Profiling Sensitivity (This determines which layers to save)...")
granularity_mode = 'block' if ENABLE_BLOCK_WISE else 'layer'
sensitivity_map = profile_sensitivity(model_fp16, calib_data, granularity=granularity_mode)

print(f"Profiling Complete. Mapped {len(sensitivity_map)} items.")

# 3. Offload FP16 Model to CPU
# CRITICAL: We move the big model to CPU RAM so the GPU is free for quantization
print("Moving FP16 model to CPU to free up VRAM...")
model_fp16.cpu()
# Collect unreachable objects first so the GPU tensors they still hold are
# released before the caching allocator is emptied.
gc.collect()
torch.cuda.empty_cache()
print("VRAM Cleared. Ready for Experiments.")

Preparing Calibration Data...
Profiling Sensitivity (This determines which layers to save)...
Profiling Sensitivity (Granularity: layer)...


Profiling Layers: 100%|██████████| 225/225 [00:12<00:00, 18.60it/s]


Profiling Complete. Mapped 225 items.
Moving FP16 model to CPU to free up VRAM...
VRAM Cleared. Ready for Experiments.


In [None]:
# Experiment A: Threshold Study (NF4)
# For each threshold: load a fresh NF4 model, restore the top-`threshold`
# fraction of most sensitive layers to FP16, evaluate, and log the result.

print(f"\n--- Starting Experiment A: Threshold Study ({CURRENT_MODEL_ID}) ---")

# 0.0 = Standard NF4 (No KLD)
# 0.05 = KLD-Guided (5% layers restored)
# Reuse the list from the configuration cell instead of a second hard-coded
# copy that can drift out of sync.
thresholds_to_test = SENSITIVITY_THRESHOLDS

# The ranking does not depend on the threshold, so sort once (most sensitive first).
sorted_layers = sorted(sensitivity_map.items(), key=lambda x: x[1], reverse=True)

for threshold in thresholds_to_test:
    print(f"\nTesting Threshold: {threshold:.0%} kept in FP16")

    run_name = f"{CURRENT_MODEL_ID.split('/')[-1]}-NF4-{threshold}"
    run = wandb.init(
        project=WANDB_PROJECT_NAME,
        name=run_name,
        config={"model": CURRENT_MODEL_ID, "threshold": threshold, "method": "KLD-NF4"}
    )

    model_nf4 = None
    try:
        # 1. Identify Layers to Keep
        num_keep = int(len(sorted_layers) * threshold)
        sensitive_layers = [n for n, _ in sorted_layers[:num_keep]]

        # 2. Load Standard NF4 Model
        model_nf4 = AutoModelForCausalLM.from_pretrained(
            CURRENT_MODEL_ID,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16
            ),
            device_map="auto",
            trust_remote_code=True
        )

        # 3. Perform Surgery (Only if threshold > 0)
        if sensitive_layers:
            perform_surgery(model_nf4, sensitive_layers, model_fp16)

        # 4. Evaluate
        acc, ppl, lat, mem, preds = evaluate_full_suite(
            model_nf4, tokenizer, mmlu_dataset, f"KLD-NF4-{threshold}"
        )

        # 5. Calculate Flip Rate
        flip = calculate_flip_rate(base_preds, preds)

        # 6. Log & Save
        wandb.log({
            "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
            "Memory": mem, "Flip_Rate": flip, "Threshold": threshold
        })

        results_table.append({
            "Model": CURRENT_MODEL_ID,
            "Method": "KLD-NF4",
            "Threshold": threshold,
            "Acc": acc,
            "Flip": flip,
            "PPL": ppl,
            "Latency": lat,
            "Mem": mem
        })
    finally:
        # Always release the model and close the run, even on failure.
        # gc.collect() is required: the model sits in reference cycles, so
        # `del` alone left each iteration's weights on the GPU and the
        # reported peak memory grew with every threshold.
        del model_nf4
        gc.collect()
        torch.cuda.empty_cache()
        run.finish()

print("Experiment A Complete.")


--- Starting Experiment A: Threshold Study (mistralai/Mistral-7B-Instruct-v0.2) ---

Testing Threshold: 0% kept in FP16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

--- Evaluating: KLD-NF4-0.0 ---


MMLU Eval: 100%|██████████| 200/200 [00:16<00:00, 12.39it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.21, Latency: 2.61s, Mem: 4.28GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁
Threshold,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,2.6119
Memory,4.27906
Perplexity,4.20779
Threshold,0.0



Testing Threshold: 1% kept in FP16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Surgery: Replacing 2 Sensitive Layers with FP16...
Surgery Complete: 2 layers restored.
--- Evaluating: KLD-NF4-0.01 ---


MMLU Eval: 100%|██████████| 200/200 [00:16<00:00, 12.25it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.15, Latency: 2.60s, Mem: 8.06GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁
Threshold,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,2.59972
Memory,8.06042
Perplexity,4.15351
Threshold,0.01



Testing Threshold: 5% kept in FP16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Surgery: Replacing 11 Sensitive Layers with FP16...
Surgery Complete: 11 layers restored.
--- Evaluating: KLD-NF4-0.05 ---


MMLU Eval: 100%|██████████| 200/200 [00:16<00:00, 12.41it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.12, Latency: 2.59s, Mem: 12.72GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁
Threshold,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,2.58634
Memory,12.72409
Perplexity,4.1183
Threshold,0.05



Testing Threshold: 10% kept in FP16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Surgery: Replacing 22 Sensitive Layers with FP16...
Surgery Complete: 22 layers restored.
--- Evaluating: KLD-NF4-0.1 ---


MMLU Eval: 100%|██████████| 200/200 [00:15<00:00, 12.78it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.11, Latency: 2.59s, Mem: 16.96GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁
Threshold,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,2.59016
Memory,16.95856
Perplexity,4.11005
Threshold,0.1



Testing Threshold: 20% kept in FP16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Surgery: Replacing 45 Sensitive Layers with FP16...
Surgery Complete: 45 layers restored.
--- Evaluating: KLD-NF4-0.2 ---


MMLU Eval: 100%|██████████| 200/200 [00:14<00:00, 13.59it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.11, Latency: 2.56s, Mem: 23.07GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁
Threshold,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,2.55655
Memory,23.07096
Perplexity,4.11295
Threshold,0.2


Experiment A Complete.


In [None]:
# Experiment B: Int8 Comparison
# Loads the model in 8-bit, restores the most sensitive layers to FP16 and
# evaluates. NOTE(review): the gating flag is named ENABLE_FP8_COMPARISON,
# but the experiment it controls is Int8.

if ENABLE_FP8_COMPARISON:
    print(f"\n--- Starting Experiment B: Int8 Comparison ({CURRENT_MODEL_ID}) ---")

    best_threshold = 0.05

    run = wandb.init(
        project=WANDB_PROJECT_NAME,
        name=f"{CURRENT_MODEL_ID.split('/')[-1]}-Int8",
        config={"model": CURRENT_MODEL_ID, "method": "KLD-Int8"}
    )

    model_int8 = None
    try:
        # Identify Layers (most sensitive first)
        sorted_layers = sorted(sensitivity_map.items(), key=lambda x: x[1], reverse=True)
        sensitive_layers = [n for n, _ in sorted_layers[:int(len(sorted_layers) * best_threshold)]]

        # Load Int8 via BitsAndBytesConfig; passing `load_in_8bit=True`
        # directly to from_pretrained is deprecated.
        model_int8 = AutoModelForCausalLM.from_pretrained(
            CURRENT_MODEL_ID,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto"
        )

        # Surgery
        perform_surgery(model_int8, sensitive_layers, model_fp16)

        # Evaluate
        acc, ppl, lat, mem, preds = evaluate_full_suite(
            model_int8, tokenizer, mmlu_dataset, "KLD-Int8"
        )

        flip = calculate_flip_rate(base_preds, preds)

        # Log "Threshold" too, so this run lines up with the threshold-study
        # runs in the same W&B project.
        wandb.log({
            "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
            "Memory": mem, "Flip_Rate": flip, "Threshold": best_threshold
        })

        results_table.append({
            "Model": CURRENT_MODEL_ID,
            "Method": "KLD-Int8",
            "Threshold": best_threshold,
            "Acc": acc,
            "Flip": flip,
            "PPL": ppl,
            "Latency": lat,
            "Mem": mem
        })
    finally:
        # Release the model and close the run even on failure. gc.collect()
        # breaks reference cycles so the GPU memory is actually returned.
        del model_int8
        gc.collect()
        torch.cuda.empty_cache()
        run.finish()
else:
    print("Experiment B skipped.")


--- Starting Experiment B: Int8 Comparison (mistralai/Mistral-7B-Instruct-v0.2) ---


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Surgery: Replacing 11 Sensitive Layers with FP16...
Surgery Complete: 11 layers restored.
--- Evaluating: KLD-Int8 ---


MMLU Eval: 100%|██████████| 200/200 [00:49<00:00,  4.05it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.00, Latency: 9.03s, Mem: 25.35GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,9.03115
Memory,25.35446
Perplexity,4.00192


In [None]:
# Experiment C: Aggressive Compression (Mixed 2-bit/FP16)
# Simulates 2-bit weights on every Linear layer except the most sensitive
# ones. The quantization is "fake": weights stay in their original dtype, so
# the memory figure does not reflect real 2-bit storage.

print(f"\n--- Starting Experiment C: Aggressive Compression ({CURRENT_MODEL_ID}) ---")

# Fraction of the most sensitive layers left untouched; used for selection
# and for the recorded "Threshold" so the two cannot disagree.
aggressive_keep_fraction = 0.05

run = wandb.init(
    project=WANDB_PROJECT_NAME,
    name=f"{CURRENT_MODEL_ID.split('/')[-1]}-Aggressive",
    config={"model": CURRENT_MODEL_ID, "method": "Mixed-2bit"}
)

model_aggressive = None
try:
    # 1. Create a Fresh Copy of FP16 on GPU
    model_aggressive = copy.deepcopy(model_fp16)
    model_aggressive.to(device)

    # 2. Identify the "Safe" layers (most sensitive first, keep the top fraction)
    sorted_layers = sorted(sensitivity_map.items(), key=lambda x: x[1], reverse=True)
    num_keep = int(len(sorted_layers) * aggressive_keep_fraction)
    sensitive_set = {n for n, _ in sorted_layers[:num_keep]}

    # 3. Apply Fake 2-bit Quantization to everything ELSE
    print("Applying simulated 2-bit quantization to 95% of layers...")
    for name, module in tqdm(model_aggressive.named_modules()):
        if isinstance(module, nn.Linear) and name not in sensitive_set:
            module.weight.data = fake_quantize_tensor_rtn(module.weight.data, bits=2)

    # 4. Evaluate
    acc, ppl, lat, mem, preds = evaluate_full_suite(
        model_aggressive, tokenizer, mmlu_dataset, "Mixed 2-bit/FP16"
    )

    flip = calculate_flip_rate(base_preds, preds)

    wandb.log({
        "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
        "Memory": mem, "Flip_Rate": flip, "Threshold": aggressive_keep_fraction
    })

    results_table.append({
        "Model": CURRENT_MODEL_ID,
        "Method": "Mixed-2bit",
        "Threshold": aggressive_keep_fraction,
        "Acc": acc,
        "Flip": flip,
        "PPL": ppl,
        "Latency": lat,
        "Mem": mem
    })
finally:
    # Release the copy and close the run even on failure. gc.collect()
    # breaks reference cycles so the GPU memory is actually returned.
    del model_aggressive
    gc.collect()
    torch.cuda.empty_cache()
    run.finish()


--- Starting Experiment C: Aggressive Compression (mistralai/Mistral-7B-Instruct-v0.2) ---


Applying simulated 2-bit quantization to 95% of layers...


423it [00:00, 8591.14it/s]


--- Evaluating: Mixed 2-bit/FP16 ---


MMLU Eval: 100%|██████████| 200/200 [00:08<00:00, 23.98it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 47550.81, Latency: 1.87s, Mem: 31.23GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,1.86615
Memory,31.23389
Perplexity,47550.8125


In [None]:
# Experiment D: AWQ Baseline
# Quantizes the model with llmcompressor's one-shot AWQ (W4A16), reloads the
# compressed checkpoint from a temporary directory, and evaluates it. The
# whole experiment is best effort: any failure is reported and skipped.

print(f"\n--- Starting Experiment D: AWQ Baseline ({CURRENT_MODEL_ID}) ---")

run = None
model_awq = None
try:
    from llmcompressor.modifiers.awq import AWQModifier
    from llmcompressor import oneshot

    run = wandb.init(
        project=WANDB_PROJECT_NAME,
        name=f"{CURRENT_MODEL_ID.split('/')[-1]}-AWQ",
        config={"model": CURRENT_MODEL_ID, "method": "AWQ"}
    )

    # 1. Calibration Data
    # Calibrate on the TRAIN split: perplexity is measured on the test split,
    # so calibrating there would leak evaluation text into quantization.
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    calib_data_obj = Dataset.from_dict(
        {"text": [text for text in ds["text"] if len(text) > 0][:CALIBRATION_SAMPLES]}
    )

    # 2. Run Oneshot AWQ
    print("Running AWQ Oneshot Quantization...")
    recipe = [AWQModifier(targets="Linear", scheme="W4A16")]
    oneshot(
        model=CURRENT_MODEL_ID,
        dataset=calib_data_obj,
        recipe=recipe,
        output_dir="./awq_temp",
        num_calibration_samples=CALIBRATION_SAMPLES,
        max_seq_length=512,
        save_compressed=True
    )

    # 3. Load & Eval
    model_awq = AutoModelForCausalLM.from_pretrained(
        "./awq_temp", device_map="auto", trust_remote_code=True
    )
    acc, ppl, lat, mem, preds = evaluate_full_suite(
        model_awq, tokenizer, mmlu_dataset, "AWQ Standard"
    )

    flip = calculate_flip_rate(base_preds, preds)

    wandb.log({
        "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
        "Memory": mem, "Flip_Rate": flip
    })

    results_table.append({
        "Model": CURRENT_MODEL_ID,
        "Method": "AWQ",
        "Threshold": 0,
        "Acc": acc,
        "Flip": flip,
        "PPL": ppl,
        "Latency": lat,
        "Mem": mem
    })

except Exception as e:
    print(f"Skipping AWQ: {e}")
finally:
    # Clean up whether or not the experiment succeeded. Previously a failure
    # left the W&B run open, the temp checkpoint on disk and the model in memory.
    del model_awq
    shutil.rmtree("./awq_temp", ignore_errors=True)
    gc.collect()
    torch.cuda.empty_cache()
    if run is not None:
        run.finish()


--- Starting Experiment D: AWQ Baseline (mistralai/Mistral-7B-Instruct-v0.2) ---


Running AWQ Oneshot Quantization...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/128 [00:00<?, ? examples/s]

2025-12-04T23:01:05.962756+0000 | reset | INFO - Compression lifecycle reset
2025-12-04T23:01:05.972916+0000 | _create_default_logger | INFO - Logging all LLM Compressor modifier-level logs to sparse_logs/04-12-2025_23.01.05.log
2025-12-04T23:01:05.974720+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-12-04T23:01:06.017786+0000 | on_initialize | INFO - No AWQModifier.mappings provided, inferring from model...


Resolving mapping 1/4 (0 skipped): : 32it [00:00, 860.36it/s]
Resolving mapping 2/4 (31 skipped): : 32it [00:00, 1020.09it/s]
Resolving mapping 3/4 (0 skipped): : 32it [00:00, 885.94it/s]
Resolving mapping 4/4 (0 skipped): : 32it [00:00, 939.45it/s]

2025-12-04T23:01:06.171783+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-12-04T23:01:06.172910+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `AWQModifier`



Preparing cache: 100%|██████████| 128/128 [00:01<00:00, 99.22it/s]
(1/33): Calibrating: 100%|██████████| 128/128 [00:02<00:00, 62.51it/s]
Smoothing: 100%|██████████| 3/3 [00:08<00:00,  2.95s/it]
(1/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 244.70it/s]
(2/33): Calibrating: 100%|██████████| 128/128 [00:02<00:00, 51.69it/s]
Smoothing: 100%|██████████| 3/3 [00:08<00:00,  2.95s/it]
(2/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 223.28it/s]
(3/33): Calibrating: 100%|██████████| 128/128 [00:01<00:00, 69.50it/s]
Smoothing: 100%|██████████| 3/3 [00:08<00:00,  2.95s/it]
(3/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 222.35it/s]
(4/33): Calibrating: 100%|██████████| 128/128 [00:01<00:00, 65.31it/s]
Smoothing: 100%|██████████| 3/3 [00:08<00:00,  2.95s/it]
(4/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 224.44it/s]
(5/33): Calibrating: 100%|██████████| 128/128 [00:01<00:00, 67.38it/s]
Smoothing: 100%|██████████| 3/3 [00:08<00:00,  2.96s/it]
(5/3

2025-12-04T23:07:24.282128+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2025-12-04T23:07:24.348037+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 225it [00:18, 12.47it/s]
Compressing model: 225it [00:00, 927.67it/s]


--- Evaluating: AWQ Standard ---


MMLU Eval: 100%|██████████| 200/200 [00:48<00:00,  4.15it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.04, Latency: 11.25s, Mem: 18.38GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,11.25171
Memory,18.38286
Perplexity,4.04441


In [None]:
# Experiment E: GPTQ Standard Baseline
#
# Quantizes CURRENT_MODEL_ID to 4-bit weights / 16-bit activations (W4A16)
# with llmcompressor's one-shot GPTQ, reloads the compressed checkpoint,
# runs the shared evaluation suite, and records the metrics both in WandB
# and in `results_table`.
#
# Any failure is reported and the experiment is skipped (best-effort), but
# cleanup (temp checkpoint, GPU memory, WandB run) always happens so that a
# failed run cannot leak state into the next experiment cell.

print(f"\n--- Starting Experiment E: GPTQ Baseline ({CURRENT_MODEL_ID}) ---")

GPTQ_TEMP_DIR = "./gptq_temp"

# Pre-declare everything the `finally` clause inspects, so cleanup works no
# matter how far the `try` body got before failing.
run = None
model_gptq = None
gptq_failed = False

try:
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor import oneshot

    run = wandb.init(
        project=WANDB_PROJECT_NAME,
        name=f"{CURRENT_MODEL_ID.split('/')[-1]}-GPTQ",
        config={"model": CURRENT_MODEL_ID, "method": "GPTQ"}
    )

    # Calibration set: the first 128 non-empty lines of WikiText-2.
    # NOTE(review): this draws calibration text from the *test* split. If
    # evaluate_full_suite computes perplexity on the same split, calibration
    # and evaluation data overlap -- confirm, and consider split="train".
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    calib_texts = [text for text in ds["text"] if text][:128]
    calib_data_obj = Dataset.from_dict({"text": calib_texts})

    # W4A16 = 4-bit Weights, 16-bit Activations
    recipe = [
        GPTQModifier(
            targets="Linear",
            scheme="W4A16",
            ignore=["lm_head"],
            dampening_frac=0.01
        )
    ]

    print("Running GPTQ Optimization...")
    oneshot(
        model=CURRENT_MODEL_ID,
        dataset=calib_data_obj,
        recipe=recipe,
        output_dir=GPTQ_TEMP_DIR,
        num_calibration_samples=128,
        max_seq_length=512,
        save_compressed=True
    )

    print("Loading GPTQ Model...")
    model_gptq = AutoModelForCausalLM.from_pretrained(
        GPTQ_TEMP_DIR,
        device_map="auto",
        trust_remote_code=True
    )

    acc, ppl, lat, mem, preds = evaluate_full_suite(
        model_gptq, tokenizer, mmlu_dataset, "GPTQ Standard"
    )

    # Flip rate is measured against the baseline model's predictions.
    flip = calculate_flip_rate(base_preds, preds)

    wandb.log({
        "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
        "Memory": mem, "Flip_Rate": flip
    })

    results_table.append({
        "Model": CURRENT_MODEL_ID,
        "Method": "GPTQ",
        "Threshold": 0,
        "Acc": acc,
        "Flip": flip,
        "PPL": ppl,
        "Latency": lat,
        "Mem": mem
    })

except Exception as e:
    # Deliberate best-effort: report and move on to the next experiment.
    gptq_failed = True
    print(f"Skipping GPTQ: {e}")

finally:
    # The checkpoint directory may not exist if quantization failed early.
    shutil.rmtree(GPTQ_TEMP_DIR, ignore_errors=True)

    # Drop the model reference and collect garbage *before* emptying the
    # CUDA cache; otherwise the weights are still referenced and stay
    # allocated on the GPU.
    model_gptq = None
    gc.collect()
    torch.cuda.empty_cache()

    # Always close the WandB run so the next wandb.init() starts clean;
    # a non-zero exit code marks the run as failed in the dashboard.
    if run is not None:
        run.finish(exit_code=1 if gptq_failed else 0)

print("Experiment E Complete.")


--- Starting Experiment E: GPTQ Baseline (mistralai/Mistral-7B-Instruct-v0.2) ---


Running GPTQ Optimization...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/128 [00:00<?, ? examples/s]

2025-12-04T23:09:04.930712+0000 | reset | INFO - Compression lifecycle reset
2025-12-04T23:09:04.934337+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-12-04T23:09:04.981622+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-12-04T23:09:04.983185+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `GPTQModifier`


Preparing cache: 100%|██████████| 128/128 [00:01<00:00, 65.95it/s]
(1/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.07it/s]

2025-12-04T23:09:11.981050+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.q_proj using 128 samples





2025-12-04T23:09:14.510459+0000 | compress | METRIC - time 2.53s
2025-12-04T23:09:14.512176+0000 | compress | METRIC - error 21.12
2025-12-04T23:09:14.514001+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:09:14.515224+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:09:14.516610+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.k_proj using 128 samples
2025-12-04T23:09:16.899179+0000 | compress | METRIC - time 2.38s
2025-12-04T23:09:16.901553+0000 | compress | METRIC - error 6.33
2025-12-04T23:09:16.902595+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:09:16.904052+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:09:16.905334+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.v_proj using 128 samples
2025-12-04T23:09:19.271871+0000 | compress | METRIC - time 2.37s
2025-12-04T23:09:19.274315+0000 | compress | METRIC - e

(1/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 162.94it/s]
(2/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.23it/s]

2025-12-04T23:09:40.722834+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.q_proj using 128 samples





2025-12-04T23:09:43.176205+0000 | compress | METRIC - time 2.45s
2025-12-04T23:09:43.178596+0000 | compress | METRIC - error 17.41
2025-12-04T23:09:43.179953+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:09:43.181254+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:09:43.182539+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.k_proj using 128 samples
2025-12-04T23:09:45.586500+0000 | compress | METRIC - time 2.40s
2025-12-04T23:09:45.589041+0000 | compress | METRIC - error 7.95
2025-12-04T23:09:45.590317+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:09:45.591581+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:09:45.592989+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.v_proj using 128 samples
2025-12-04T23:09:47.971113+0000 | compress | METRIC - time 2.38s
2025-12-04T23:09:47.973487+0000 | compress | METRIC - e

(2/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 184.86it/s]
(3/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.46it/s]

2025-12-04T23:10:09.128878+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.q_proj using 128 samples





2025-12-04T23:10:11.593983+0000 | compress | METRIC - time 2.46s
2025-12-04T23:10:11.595678+0000 | compress | METRIC - error 102.55
2025-12-04T23:10:11.596884+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:10:11.597911+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:10:11.599298+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.k_proj using 128 samples
2025-12-04T23:10:13.966454+0000 | compress | METRIC - time 2.37s
2025-12-04T23:10:13.968805+0000 | compress | METRIC - error 49.56
2025-12-04T23:10:13.969969+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:10:13.970945+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:10:13.972488+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.v_proj using 128 samples
2025-12-04T23:10:16.329154+0000 | compress | METRIC - time 2.36s
2025-12-04T23:10:16.331551+0000 | compress | METRIC -

(3/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 217.75it/s]
(4/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.39it/s]

2025-12-04T23:10:37.432995+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.q_proj using 128 samples





2025-12-04T23:10:39.895046+0000 | compress | METRIC - time 2.46s
2025-12-04T23:10:39.897562+0000 | compress | METRIC - error 65.28
2025-12-04T23:10:39.899087+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:10:39.900501+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:10:39.901704+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.k_proj using 128 samples
2025-12-04T23:10:42.267775+0000 | compress | METRIC - time 2.36s
2025-12-04T23:10:42.270203+0000 | compress | METRIC - error 31.79
2025-12-04T23:10:42.271524+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:10:42.272616+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:10:42.273995+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.v_proj using 128 samples
2025-12-04T23:10:44.641572+0000 | compress | METRIC - time 2.37s
2025-12-04T23:10:44.644142+0000 | compress | METRIC - 

(4/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 213.59it/s]
(5/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.46it/s]

2025-12-04T23:11:05.654230+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.q_proj using 128 samples





2025-12-04T23:11:08.096552+0000 | compress | METRIC - time 2.44s
2025-12-04T23:11:08.098919+0000 | compress | METRIC - error 109.95
2025-12-04T23:11:08.100492+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:11:08.101377+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:11:08.102940+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.k_proj using 128 samples
2025-12-04T23:11:10.476475+0000 | compress | METRIC - time 2.37s
2025-12-04T23:11:10.478163+0000 | compress | METRIC - error 48.65
2025-12-04T23:11:10.479700+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:11:10.480946+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:11:10.482333+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.v_proj using 128 samples
2025-12-04T23:11:12.858554+0000 | compress | METRIC - time 2.37s
2025-12-04T23:11:12.860874+0000 | compress | METRIC -

(5/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 218.40it/s]
(6/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.39it/s]

2025-12-04T23:11:33.884273+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.q_proj using 128 samples





2025-12-04T23:11:36.374497+0000 | compress | METRIC - time 2.49s
2025-12-04T23:11:36.376976+0000 | compress | METRIC - error 161.86
2025-12-04T23:11:36.378688+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:11:36.379665+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:11:36.380974+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.k_proj using 128 samples
2025-12-04T23:11:38.744169+0000 | compress | METRIC - time 2.36s
2025-12-04T23:11:38.746675+0000 | compress | METRIC - error 71.75
2025-12-04T23:11:38.748224+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:11:38.749625+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:11:38.751069+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.v_proj using 128 samples
2025-12-04T23:11:41.121728+0000 | compress | METRIC - time 2.37s
2025-12-04T23:11:41.124436+0000 | compress | METRIC -

(6/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 217.74it/s]
(7/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.47it/s]

2025-12-04T23:12:02.054491+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.q_proj using 128 samples





2025-12-04T23:12:04.522286+0000 | compress | METRIC - time 2.47s
2025-12-04T23:12:04.524891+0000 | compress | METRIC - error 146.31
2025-12-04T23:12:04.526232+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:12:04.527651+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:12:04.529065+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.k_proj using 128 samples
2025-12-04T23:12:06.903079+0000 | compress | METRIC - time 2.37s
2025-12-04T23:12:06.905618+0000 | compress | METRIC - error 68.59
2025-12-04T23:12:06.906974+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:12:06.908419+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:12:06.909825+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.v_proj using 128 samples
2025-12-04T23:12:09.268668+0000 | compress | METRIC - time 2.36s
2025-12-04T23:12:09.271243+0000 | compress | METRIC -

(7/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 216.93it/s]
(8/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.47it/s]

2025-12-04T23:12:30.315444+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.q_proj using 128 samples





2025-12-04T23:12:32.764569+0000 | compress | METRIC - time 2.45s
2025-12-04T23:12:32.767052+0000 | compress | METRIC - error 181.67
2025-12-04T23:12:32.768387+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:12:32.769616+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:12:32.770939+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.k_proj using 128 samples
2025-12-04T23:12:35.159650+0000 | compress | METRIC - time 2.39s
2025-12-04T23:12:35.162477+0000 | compress | METRIC - error 86.59
2025-12-04T23:12:35.164014+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:12:35.165379+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:12:35.166771+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.v_proj using 128 samples
2025-12-04T23:12:37.536556+0000 | compress | METRIC - time 2.37s
2025-12-04T23:12:37.539021+0000 | compress | METRIC -

(8/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 213.66it/s]
(9/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.45it/s]

2025-12-04T23:12:58.806501+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.q_proj using 128 samples





2025-12-04T23:13:01.311410+0000 | compress | METRIC - time 2.50s
2025-12-04T23:13:01.313833+0000 | compress | METRIC - error 165.87
2025-12-04T23:13:01.315528+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:13:01.316888+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:13:01.318102+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.k_proj using 128 samples
2025-12-04T23:13:03.694102+0000 | compress | METRIC - time 2.37s
2025-12-04T23:13:03.695760+0000 | compress | METRIC - error 75.34
2025-12-04T23:13:03.697308+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:13:03.698793+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:13:03.700127+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.v_proj using 128 samples
2025-12-04T23:13:06.067112+0000 | compress | METRIC - time 2.37s
2025-12-04T23:13:06.069684+0000 | compress | METRIC -

(9/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 213.75it/s]
(10/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.42it/s]

2025-12-04T23:13:27.167567+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.q_proj using 128 samples





2025-12-04T23:13:29.638553+0000 | compress | METRIC - time 2.47s
2025-12-04T23:13:29.640991+0000 | compress | METRIC - error 248.41
2025-12-04T23:13:29.642384+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:13:29.643681+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:13:29.645234+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.k_proj using 128 samples
2025-12-04T23:13:31.990251+0000 | compress | METRIC - time 2.34s
2025-12-04T23:13:31.992622+0000 | compress | METRIC - error 116.60
2025-12-04T23:13:31.994033+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:13:31.995300+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:13:31.996740+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.v_proj using 128 samples
2025-12-04T23:13:34.374954+0000 | compress | METRIC - time 2.38s
2025-12-04T23:13:34.377475+0000 | compress | METRIC 

(10/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 217.24it/s]
(11/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.43it/s]

2025-12-04T23:13:55.461827+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.q_proj using 128 samples





2025-12-04T23:13:57.937565+0000 | compress | METRIC - time 2.47s
2025-12-04T23:13:57.939826+0000 | compress | METRIC - error 206.97
2025-12-04T23:13:57.941221+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:13:57.942654+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:13:57.943977+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.k_proj using 128 samples
2025-12-04T23:14:00.376826+0000 | compress | METRIC - time 2.43s
2025-12-04T23:14:00.379536+0000 | compress | METRIC - error 98.69
2025-12-04T23:14:00.380987+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:14:00.381988+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:14:00.383459+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.v_proj using 128 samples
2025-12-04T23:14:02.799774+0000 | compress | METRIC - time 2.41s
2025-12-04T23:14:02.802352+0000 | compress | METRIC

(11/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 219.57it/s]
(12/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.41it/s]

2025-12-04T23:14:23.926168+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.q_proj using 128 samples





2025-12-04T23:14:26.416906+0000 | compress | METRIC - time 2.49s
2025-12-04T23:14:26.419407+0000 | compress | METRIC - error 259.04
2025-12-04T23:14:26.420827+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:14:26.422078+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:14:26.423429+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.k_proj using 128 samples
2025-12-04T23:14:28.805546+0000 | compress | METRIC - time 2.38s
2025-12-04T23:14:28.807977+0000 | compress | METRIC - error 119.52
2025-12-04T23:14:28.809464+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:14:28.810475+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:14:28.812187+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.v_proj using 128 samples
2025-12-04T23:14:31.184794+0000 | compress | METRIC - time 2.37s
2025-12-04T23:14:31.187130+0000 | compress | METRI

(12/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 215.87it/s]
(13/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.44it/s]

2025-12-04T23:14:52.229934+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.q_proj using 128 samples





2025-12-04T23:14:54.701697+0000 | compress | METRIC - time 2.47s
2025-12-04T23:14:54.703629+0000 | compress | METRIC - error 344.06
2025-12-04T23:14:54.705015+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:14:54.706140+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:14:54.707676+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.k_proj using 128 samples
2025-12-04T23:14:57.087357+0000 | compress | METRIC - time 2.38s
2025-12-04T23:14:57.089707+0000 | compress | METRIC - error 155.68
2025-12-04T23:14:57.091172+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:14:57.092596+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:14:57.094120+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.v_proj using 128 samples
2025-12-04T23:14:59.491146+0000 | compress | METRIC - time 2.40s
2025-12-04T23:14:59.493793+0000 | compress | METRI

(13/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 217.81it/s]
(14/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.36it/s]

2025-12-04T23:15:20.726359+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.q_proj using 128 samples





2025-12-04T23:15:23.209877+0000 | compress | METRIC - time 2.48s
2025-12-04T23:15:23.211300+0000 | compress | METRIC - error 297.89
2025-12-04T23:15:23.212747+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:15:23.213881+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:15:23.215139+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.k_proj using 128 samples
2025-12-04T23:15:25.603553+0000 | compress | METRIC - time 2.39s
2025-12-04T23:15:25.605318+0000 | compress | METRIC - error 146.09
2025-12-04T23:15:25.606861+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:15:25.608467+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:15:25.609913+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.v_proj using 128 samples
2025-12-04T23:15:28.003711+0000 | compress | METRIC - time 2.39s
2025-12-04T23:15:28.006277+0000 | compress | METRI

(14/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 224.10it/s]
(15/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.41it/s]

2025-12-04T23:15:49.029011+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.q_proj using 128 samples





2025-12-04T23:15:51.517932+0000 | compress | METRIC - time 2.49s
2025-12-04T23:15:51.520437+0000 | compress | METRIC - error 354.40
2025-12-04T23:15:51.521838+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:15:51.523211+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:15:51.524678+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.k_proj using 128 samples
2025-12-04T23:15:53.913243+0000 | compress | METRIC - time 2.39s
2025-12-04T23:15:53.914808+0000 | compress | METRIC - error 151.67
2025-12-04T23:15:53.916136+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:15:53.917200+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:15:53.918876+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.v_proj using 128 samples
2025-12-04T23:15:56.301731+0000 | compress | METRIC - time 2.38s
2025-12-04T23:15:56.304198+0000 | compress | METRI

(15/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 211.28it/s]
(16/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.40it/s]

2025-12-04T23:16:17.452054+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.q_proj using 128 samples





2025-12-04T23:16:19.932714+0000 | compress | METRIC - time 2.48s
2025-12-04T23:16:19.934947+0000 | compress | METRIC - error 464.66
2025-12-04T23:16:19.936241+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:16:19.937653+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:16:19.939080+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.k_proj using 128 samples
2025-12-04T23:16:22.355377+0000 | compress | METRIC - time 2.42s
2025-12-04T23:16:22.357064+0000 | compress | METRIC - error 206.02
2025-12-04T23:16:22.358432+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:16:22.359515+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:16:22.361188+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.v_proj using 128 samples
2025-12-04T23:16:24.758291+0000 | compress | METRIC - time 2.40s
2025-12-04T23:16:24.760103+0000 | compress | METRI

(16/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 215.19it/s]
(17/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.42it/s]

2025-12-04T23:16:45.957293+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.q_proj using 128 samples





2025-12-04T23:16:48.442329+0000 | compress | METRIC - time 2.48s
2025-12-04T23:16:48.444323+0000 | compress | METRIC - error 442.29
2025-12-04T23:16:48.445823+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:16:48.447069+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:16:48.448772+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.k_proj using 128 samples
2025-12-04T23:16:50.855608+0000 | compress | METRIC - time 2.41s
2025-12-04T23:16:50.858313+0000 | compress | METRIC - error 204.49
2025-12-04T23:16:50.859686+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:16:50.860693+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:16:50.862358+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.v_proj using 128 samples
2025-12-04T23:16:53.223918+0000 | compress | METRIC - time 2.36s
2025-12-04T23:16:53.226381+0000 | compress | METRI

(17/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 219.40it/s]
(18/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.26it/s]

2025-12-04T23:17:14.241010+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.q_proj using 128 samples





2025-12-04T23:17:16.740218+0000 | compress | METRIC - time 2.50s
2025-12-04T23:17:16.743037+0000 | compress | METRIC - error 385.78
2025-12-04T23:17:16.744594+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:17:16.746072+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:17:16.747541+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.k_proj using 128 samples
2025-12-04T23:17:19.115072+0000 | compress | METRIC - time 2.37s
2025-12-04T23:17:19.116647+0000 | compress | METRIC - error 161.72
2025-12-04T23:17:19.118191+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:17:19.119354+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:17:19.121314+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.v_proj using 128 samples
2025-12-04T23:17:21.519843+0000 | compress | METRIC - time 2.40s
2025-12-04T23:17:21.522198+0000 | compress | METRI

(18/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 214.38it/s]
(19/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.32it/s]

2025-12-04T23:17:42.706227+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.q_proj using 128 samples





2025-12-04T23:17:45.199564+0000 | compress | METRIC - time 2.49s
2025-12-04T23:17:45.202196+0000 | compress | METRIC - error 521.67
2025-12-04T23:17:45.203683+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:17:45.204871+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:17:45.206729+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.k_proj using 128 samples
2025-12-04T23:17:47.560998+0000 | compress | METRIC - time 2.35s
2025-12-04T23:17:47.563525+0000 | compress | METRIC - error 203.91
2025-12-04T23:17:47.564799+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:17:47.565722+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:17:47.567508+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.v_proj using 128 samples
2025-12-04T23:17:49.951804+0000 | compress | METRIC - time 2.38s
2025-12-04T23:17:49.954532+0000 | compress | METRI

(19/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 221.39it/s]
(20/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.41it/s]

2025-12-04T23:18:10.883871+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.q_proj using 128 samples





2025-12-04T23:18:13.354188+0000 | compress | METRIC - time 2.47s
2025-12-04T23:18:13.356713+0000 | compress | METRIC - error 505.00
2025-12-04T23:18:13.358043+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:18:13.359308+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:18:13.360811+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.k_proj using 128 samples
2025-12-04T23:18:15.723569+0000 | compress | METRIC - time 2.36s
2025-12-04T23:18:15.726158+0000 | compress | METRIC - error 214.48
2025-12-04T23:18:15.727514+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:18:15.728634+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:18:15.730159+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.v_proj using 128 samples
2025-12-04T23:18:18.125379+0000 | compress | METRIC - time 2.39s
2025-12-04T23:18:18.128065+0000 | compress | METRI

(20/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 213.99it/s]
(21/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.33it/s]

2025-12-04T23:18:39.240914+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.q_proj using 128 samples





2025-12-04T23:18:41.705283+0000 | compress | METRIC - time 2.46s
2025-12-04T23:18:41.708031+0000 | compress | METRIC - error 545.85
2025-12-04T23:18:41.709743+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:18:41.711069+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:18:41.712559+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.k_proj using 128 samples
2025-12-04T23:18:44.109333+0000 | compress | METRIC - time 2.40s
2025-12-04T23:18:44.111815+0000 | compress | METRIC - error 222.91
2025-12-04T23:18:44.113490+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:18:44.114773+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:18:44.116116+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.v_proj using 128 samples
2025-12-04T23:18:46.495000+0000 | compress | METRIC - time 2.38s
2025-12-04T23:18:46.497517+0000 | compress | METRI

(21/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 221.66it/s]
(22/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.37it/s]

2025-12-04T23:19:07.533539+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.q_proj using 128 samples





2025-12-04T23:19:10.014613+0000 | compress | METRIC - time 2.48s
2025-12-04T23:19:10.017234+0000 | compress | METRIC - error 549.44
2025-12-04T23:19:10.019309+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:19:10.020670+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:19:10.022122+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.k_proj using 128 samples
2025-12-04T23:19:12.426994+0000 | compress | METRIC - time 2.40s
2025-12-04T23:19:12.428791+0000 | compress | METRIC - error 222.72
2025-12-04T23:19:12.430506+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:19:12.431874+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:19:12.433271+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.v_proj using 128 samples
2025-12-04T23:19:14.819570+0000 | compress | METRIC - time 2.39s
2025-12-04T23:19:14.822182+0000 | compress | METRI

(22/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 221.22it/s]
(23/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.37it/s]

2025-12-04T23:19:35.898445+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.q_proj using 128 samples





2025-12-04T23:19:38.383995+0000 | compress | METRIC - time 2.48s
2025-12-04T23:19:38.386540+0000 | compress | METRIC - error 540.31
2025-12-04T23:19:38.388109+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:19:38.389289+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:19:38.390646+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.k_proj using 128 samples
2025-12-04T23:19:40.764608+0000 | compress | METRIC - time 2.37s
2025-12-04T23:19:40.767145+0000 | compress | METRIC - error 214.90
2025-12-04T23:19:40.768576+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:19:40.769568+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:19:40.771131+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.v_proj using 128 samples
2025-12-04T23:19:43.146044+0000 | compress | METRIC - time 2.37s
2025-12-04T23:19:43.148831+0000 | compress | METRI

(23/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 223.37it/s]
(24/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.44it/s]

2025-12-04T23:20:04.180436+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.q_proj using 128 samples





2025-12-04T23:20:06.657643+0000 | compress | METRIC - time 2.48s
2025-12-04T23:20:06.660223+0000 | compress | METRIC - error 569.94
2025-12-04T23:20:06.661478+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:20:06.662781+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:20:06.664036+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.k_proj using 128 samples
2025-12-04T23:20:09.047836+0000 | compress | METRIC - time 2.38s
2025-12-04T23:20:09.050463+0000 | compress | METRIC - error 223.63
2025-12-04T23:20:09.052162+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:20:09.053487+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:20:09.054787+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.v_proj using 128 samples
2025-12-04T23:20:11.433164+0000 | compress | METRIC - time 2.38s
2025-12-04T23:20:11.435544+0000 | compress | METRI

(24/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 221.71it/s]
(25/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.43it/s]

2025-12-04T23:20:32.475229+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.q_proj using 128 samples





2025-12-04T23:20:34.945102+0000 | compress | METRIC - time 2.47s
2025-12-04T23:20:34.947737+0000 | compress | METRIC - error 667.55
2025-12-04T23:20:34.949380+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:20:34.950663+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:20:34.952217+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.k_proj using 128 samples
2025-12-04T23:20:37.335485+0000 | compress | METRIC - time 2.38s
2025-12-04T23:20:37.338187+0000 | compress | METRIC - error 266.92
2025-12-04T23:20:37.339726+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:20:37.340950+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:20:37.342293+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.v_proj using 128 samples
2025-12-04T23:20:39.712356+0000 | compress | METRIC - time 2.37s
2025-12-04T23:20:39.715292+0000 | compress | METRI

(25/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 217.31it/s]
(26/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.30it/s]

2025-12-04T23:21:00.730083+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.q_proj using 128 samples





2025-12-04T23:21:03.191439+0000 | compress | METRIC - time 2.46s
2025-12-04T23:21:03.194053+0000 | compress | METRIC - error 686.76
2025-12-04T23:21:03.195411+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:21:03.196675+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:21:03.198022+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.k_proj using 128 samples
2025-12-04T23:21:05.564135+0000 | compress | METRIC - time 2.37s
2025-12-04T23:21:05.566589+0000 | compress | METRIC - error 263.90
2025-12-04T23:21:05.567856+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:21:05.569049+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:21:05.570819+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.v_proj using 128 samples
2025-12-04T23:21:07.952324+0000 | compress | METRIC - time 2.38s
2025-12-04T23:21:07.954795+0000 | compress | METRI

(26/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 220.64it/s]
(27/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.40it/s]

2025-12-04T23:21:29.036252+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.q_proj using 128 samples





2025-12-04T23:21:31.468738+0000 | compress | METRIC - time 2.43s
2025-12-04T23:21:31.471381+0000 | compress | METRIC - error 644.13
2025-12-04T23:21:31.472882+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:21:31.474003+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:21:31.475518+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.k_proj using 128 samples
2025-12-04T23:21:33.832506+0000 | compress | METRIC - time 2.36s
2025-12-04T23:21:33.834119+0000 | compress | METRIC - error 244.67
2025-12-04T23:21:33.835540+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:21:33.836672+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:21:33.838019+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.v_proj using 128 samples
2025-12-04T23:21:36.281576+0000 | compress | METRIC - time 2.44s
2025-12-04T23:21:36.284390+0000 | compress | METRI

(27/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 216.68it/s]
(28/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.39it/s]

2025-12-04T23:21:57.168102+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.q_proj using 128 samples





2025-12-04T23:21:59.640698+0000 | compress | METRIC - time 2.47s
2025-12-04T23:21:59.643518+0000 | compress | METRIC - error 719.42
2025-12-04T23:21:59.645150+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:21:59.646203+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:21:59.647791+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.k_proj using 128 samples
2025-12-04T23:22:02.041773+0000 | compress | METRIC - time 2.39s
2025-12-04T23:22:02.044294+0000 | compress | METRIC - error 267.55
2025-12-04T23:22:02.045615+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:22:02.046711+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:22:02.048167+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.v_proj using 128 samples
2025-12-04T23:22:04.415722+0000 | compress | METRIC - time 2.37s
2025-12-04T23:22:04.418253+0000 | compress | METRI

(28/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 223.16it/s]
(29/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.36it/s]

2025-12-04T23:22:25.399087+0000 | compress_modules | INFO - Quantizing model.layers.28.self_attn.q_proj using 128 samples





2025-12-04T23:22:27.850577+0000 | compress | METRIC - time 2.45s
2025-12-04T23:22:27.853391+0000 | compress | METRIC - error 843.90
2025-12-04T23:22:27.854859+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:22:27.856201+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:22:27.857593+0000 | compress_modules | INFO - Quantizing model.layers.28.self_attn.k_proj using 128 samples
2025-12-04T23:22:30.234842+0000 | compress | METRIC - time 2.38s
2025-12-04T23:22:30.237495+0000 | compress | METRIC - error 322.59
2025-12-04T23:22:30.239027+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:22:30.240408+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:22:30.241761+0000 | compress_modules | INFO - Quantizing model.layers.28.self_attn.v_proj using 128 samples
2025-12-04T23:22:32.618204+0000 | compress | METRIC - time 2.38s
2025-12-04T23:22:32.620749+0000 | compress | METRI

(29/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 219.35it/s]
(30/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.42it/s]

2025-12-04T23:22:53.574164+0000 | compress_modules | INFO - Quantizing model.layers.29.self_attn.q_proj using 128 samples





2025-12-04T23:22:56.016754+0000 | compress | METRIC - time 2.44s
2025-12-04T23:22:56.019376+0000 | compress | METRIC - error 925.16
2025-12-04T23:22:56.020980+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:22:56.022057+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:22:56.023740+0000 | compress_modules | INFO - Quantizing model.layers.29.self_attn.k_proj using 128 samples
2025-12-04T23:22:58.412722+0000 | compress | METRIC - time 2.39s
2025-12-04T23:22:58.415501+0000 | compress | METRIC - error 312.04
2025-12-04T23:22:58.417062+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:22:58.418502+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:22:58.419924+0000 | compress_modules | INFO - Quantizing model.layers.29.self_attn.v_proj using 128 samples
2025-12-04T23:23:00.815643+0000 | compress | METRIC - time 2.39s
2025-12-04T23:23:00.818339+0000 | compress | METRI

(30/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 220.60it/s]
(31/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.41it/s]

2025-12-04T23:23:21.761327+0000 | compress_modules | INFO - Quantizing model.layers.30.self_attn.q_proj using 128 samples





2025-12-04T23:23:24.215318+0000 | compress | METRIC - time 2.45s
2025-12-04T23:23:24.218008+0000 | compress | METRIC - error 845.06
2025-12-04T23:23:24.219314+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:23:24.220628+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:23:24.222183+0000 | compress_modules | INFO - Quantizing model.layers.30.self_attn.k_proj using 128 samples
2025-12-04T23:23:26.575071+0000 | compress | METRIC - time 2.35s
2025-12-04T23:23:26.577576+0000 | compress | METRIC - error 281.46
2025-12-04T23:23:26.578859+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:23:26.579935+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:23:26.581158+0000 | compress_modules | INFO - Quantizing model.layers.30.self_attn.v_proj using 128 samples
2025-12-04T23:23:28.943288+0000 | compress | METRIC - time 2.36s
2025-12-04T23:23:28.945839+0000 | compress | METRI

(31/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 220.83it/s]
(32/33): Calibrating: 100%|██████████| 128/128 [00:03<00:00, 41.36it/s]

2025-12-04T23:23:50.005611+0000 | compress_modules | INFO - Quantizing model.layers.31.self_attn.q_proj using 128 samples





2025-12-04T23:23:52.442452+0000 | compress | METRIC - time 2.44s
2025-12-04T23:23:52.444910+0000 | compress | METRIC - error 769.54
2025-12-04T23:23:52.446292+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:23:52.447629+0000 | compress | METRIC - Compressed module size: 33.947648 MB
2025-12-04T23:23:52.449126+0000 | compress_modules | INFO - Quantizing model.layers.31.self_attn.k_proj using 128 samples
2025-12-04T23:23:54.793516+0000 | compress | METRIC - time 2.34s
2025-12-04T23:23:54.796021+0000 | compress | METRIC - error 265.16
2025-12-04T23:23:54.797297+0000 | compress | METRIC - GPU 0 | usage: 44.84% | total memory: 85 GB
2025-12-04T23:23:54.798548+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2025-12-04T23:23:54.799924+0000 | compress_modules | INFO - Quantizing model.layers.31.self_attn.v_proj using 128 samples
2025-12-04T23:23:57.127524+0000 | compress | METRIC - time 2.33s
2025-12-04T23:23:57.129983+0000 | compress | METRI

(32/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 175.91it/s]
(33/33): Calibrating: 100%|██████████| 128/128 [00:00<00:00, 497.81it/s]
(33/33): Propagating: 100%|██████████| 128/128 [00:00<00:00, 669.58it/s]


2025-12-04T23:24:15.958431+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2025-12-04T23:24:16.013240+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 224it [00:30,  7.23it/s]


Loading GPTQ Model...


Compressing model: 224it [00:00, 928.58it/s]


--- Evaluating: GPTQ Standard ---


MMLU Eval: 100%|██████████| 200/200 [00:46<00:00,  4.30it/s]
Computing PPL:   0%|          | 0/4 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Results -> Acc: 22.00%, PPL: 4.06, Latency: 11.01s, Mem: 34.31GB


0,1
Accuracy,▁
Flip_Rate,▁
Latency,▁
Memory,▁
Perplexity,▁

0,1
Accuracy,0.22
Flip_Rate,0.0
Latency,11.00744
Memory,34.307
Perplexity,4.05763


Experiment E Complete.


# Visualization & Reporting

Qwen (Qwen/Qwen2.5-0.5B-Instruct) results:
```

                        Model         Method  Threshold    Acc   Flip  \
0  Qwen/Qwen2.5-0.5B-Instruct  FP16 Baseline       0.00  0.250  0.000   
1  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.00  0.265  0.630   
2  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.01  0.265  0.630   
3  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.05  0.280  0.430   
4  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.10  0.280  0.395   
5  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.20  0.290  0.305   
6  Qwen/Qwen2.5-0.5B-Instruct       KLD-Int8       0.05  0.250  0.155   
7  Qwen/Qwen2.5-0.5B-Instruct     Mixed-2bit       0.05  0.225  0.465   
8  Qwen/Qwen2.5-0.5B-Instruct            AWQ       0.00  0.240  0.430   
9  Qwen/Qwen2.5-0.5B-Instruct           GPTQ       0.00  0.205  0.385   

             PPL   Latency       Mem  
0       8.350414  2.043066  0.942955  
1       8.855442  2.643480  0.458393  
2       8.855442  2.512811  0.711964  
3       8.830623  2.543613  0.949767  
4       8.774139  2.509315  1.014822  
5       8.657179  2.484945  1.139766  
6       8.353897  7.321452  1.071951  
7  537496.375000  1.837433  1.131078  
8      12.012968  5.784297  3.139405  
9       9.055728  4.997509  1.655096

```



In [None]:
# View Results Table
# Turn the accumulated experiment rows (`results_table`, filled in by the
# experiment cells) into a DataFrame and print it for a quick look.
df = pd.DataFrame(data=results_table)
print(df)

                                Model         Method  Threshold   Acc  Flip  \
0  mistralai/Mistral-7B-Instruct-v0.2  FP16 Baseline       0.00  0.22   0.0   
1  mistralai/Mistral-7B-Instruct-v0.2        KLD-NF4       0.00  0.22   0.0   
2  mistralai/Mistral-7B-Instruct-v0.2        KLD-NF4       0.01  0.22   0.0   
3  mistralai/Mistral-7B-Instruct-v0.2        KLD-NF4       0.05  0.22   0.0   
4  mistralai/Mistral-7B-Instruct-v0.2        KLD-NF4       0.10  0.22   0.0   
5  mistralai/Mistral-7B-Instruct-v0.2        KLD-NF4       0.20  0.22   0.0   
6  mistralai/Mistral-7B-Instruct-v0.2       KLD-Int8       0.05  0.22   0.0   
7  mistralai/Mistral-7B-Instruct-v0.2     Mixed-2bit       0.05  0.22   0.0   
8  mistralai/Mistral-7B-Instruct-v0.2            AWQ       0.00  0.22   0.0   
9  mistralai/Mistral-7B-Instruct-v0.2           GPTQ       0.00  0.22   0.0   

            PPL    Latency        Mem  
0      4.018649   2.021485  13.503991  
1      4.207792   2.611904   4.279063  
2      4.1

In [None]:
# Save Results to Google Drive
# Imports are grouped standard library first, then third-party / Colab.
import os

import pandas as pd
from google.colab import drive

# 1. Mount Google Drive
# Colab shows a popup asking for permission before the Drive contents
# become available under the mount point below.
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
# 2. Define Filename
# We include the model name to avoid overwriting previous results
model_name = CURRENT_MODEL_ID.split('/')[-1] if 'CURRENT_MODEL_ID' in globals() else "experiment"
csv_name = f"{model_name}_1204results.csv"

# Only target Google Drive when it is really mounted. If the mount step
# failed, creating folders under /content/drive would just produce a plain
# local directory: the results would not reach Drive, and a non-empty
# mountpoint may make a later drive.mount() call fail (verify against the
# google.colab docs). In that case fall back to the working directory.
drive_root = "/content/drive"
drive_mounted = os.path.ismount(drive_root)
if drive_mounted:
    filename = f"{drive_root}/MyDrive/Columbia-LLMSeminar/SLLM project/Mena/{csv_name}"
else:
    filename = os.path.abspath(csv_name)

# 3. Create Directory if it doesn't exist
os.makedirs(os.path.dirname(filename), exist_ok=True)

# 4. Save
if results_table:
    df = pd.DataFrame(results_table)
    df.to_csv(filename, index=False)
    if drive_mounted:
        print(f"✅ Success! Results saved to Google Drive at:\n{filename}")
    else:
        # Be explicit that this copy lives on the ephemeral Colab disk.
        print(
            "⚠️ Warning: Google Drive is not mounted. "
            f"Results saved locally instead at:\n{filename}"
        )
else:
    print("⚠️ Warning: results_table is empty. Nothing to save.")

In [None]:
# Log Final Summary to WandB
#
# Uploads three things to a dedicated "Final-Summary-Report" run:
#   * the raw results table (sortable / queryable in the WandB UI),
#   * the static PNG plots written by the visualization cell,
#   * an interactive Memory-vs-Perplexity scatter chart.
#
# NOTE: in this notebook the visualization cell that writes the PNGs appears
# below this cell, so run it first or the images are skipped with a warning.
import os

print("\n--- Uploading Final Report to Weights & Biases ---")

image_files = ['graph1_sweet_spot.png', 'graph2_flip_rate.png', 'graph3_efficiency.png']

# 1. Initialize a generic "Summary" run
# Using the run as a context manager guarantees it is finished (and marked as
# failed on error) even if one of the uploads below raises.
with wandb.init(
    project=WANDB_PROJECT_NAME,
    name="Final-Summary-Report",
    job_type="report"
) as run:
    # 2. Upload the Master Data Table
    # This allows you to query/sort your results in the WandB UI
    if results_table:
        tbl = wandb.Table(dataframe=pd.DataFrame(results_table))
        run.log({"Experiment_Results_Raw": tbl})

    # 3. Upload the Matplotlib Images
    # Each PNG is logged under its file name without the extension.
    for img_file in image_files:
        if os.path.exists(img_file):
            run.log({img_file.replace(".png", ""): wandb.Image(img_file)})
            print(f"Uploaded {img_file}")
        else:
            print(f"Warning: {img_file} not found. Did you run Cell 7?")

    # 4. Create an Interactive Custom Chart (Efficiency Frontier)
    # This creates a native WandB chart where you can hover over dots to see model details
    if results_table:
        data = [[r['Method'], r['Threshold'], r['Mem'], r['PPL']] for r in results_table]
        table = wandb.Table(data=data, columns=["Method", "Threshold", "Memory", "Perplexity"])

        # Custom Scatter Plot definition
        run.log({
            "Efficiency_Frontier_Interactive": wandb.plot.scatter(
                table, "Memory", "Perplexity", title="Efficiency Frontier (Interactive)"
            )
        })

print("Upload Complete. Check your WandB Dashboard!")

In [None]:
# Visualization & Reporting
#
# Builds three figures from `results_table` for a single model and saves each
# one as a PNG in the working directory:
#   graph1_sweet_spot.png  - MMLU accuracy vs. FP16 retention threshold
#   graph2_flip_rate.png   - flip rate per threshold
#   graph3_efficiency.png  - perplexity vs. memory for every method
# It also writes the raw rows to experiment_results.csv and prints a table.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Prepare Data
if not results_table:
    print("No results to plot! Run the experiments first.")
else:
    df = pd.DataFrame(results_table)

    # Optional: Save raw data to CSV for your paper
    df.to_csv("experiment_results.csv", index=False)
    print("Results saved to 'experiment_results.csv'")

    # Filter for the model we just tested (or select the first one available)
    available_models = list(df['Model'].unique())
    target_model = CURRENT_MODEL_ID if 'CURRENT_MODEL_ID' in globals() else available_models[0]
    if target_model not in available_models:
        # Without this fallback every filtered frame below would be empty and
        # the plotting code would fail on the first lookup.
        print(f"Warning: no results recorded for '{target_model}'; "
              f"plotting '{available_models[0]}' instead.")
        target_model = available_models[0]
    model_df = df[df['Model'] == target_model]

    print(f"\nGenerating Plots for: {target_model}")

    # Set Style
    sns.set_theme(style="whitegrid")
    plt.rcParams.update({'font.size': 12})

    # ==============================================================================
    # Graph 1: The "Sweet Spot" (Accuracy vs. Threshold)
    # Goal: Show that 5% FP16 recovery beats the Standard NF4 baseline
    # ==============================================================================
    plt.figure(figsize=(10, 6))

    # Filter for KLD-NF4 data points
    nf4_data = model_df[model_df['Method'] == 'KLD-NF4'].sort_values('Threshold')

    # Plot the KLD Curve
    sns.lineplot(data=nf4_data, x='Threshold', y='Acc', marker='o', label='KLD-Guided NF4', linewidth=2.5)

    # Plot Baseline Reference (FP16) - only when a baseline row was recorded
    baseline_rows = model_df.loc[model_df['Method'] == 'FP16 Baseline', 'Acc']
    if baseline_rows.empty:
        print("Warning: no 'FP16 Baseline' row found; skipping the baseline reference line.")
    else:
        baseline_acc = baseline_rows.iloc[0]
        plt.axhline(y=baseline_acc, color='green', linestyle='--', label=f'FP16 Baseline ({baseline_acc:.1%})')

    # Formatting
    plt.title(f'The Sweet Spot: Accuracy vs. FP16 Retention ({target_model})', fontsize=14)
    plt.xlabel('Percentage of Layers Kept in FP16', fontsize=12)
    plt.ylabel('MMLU Accuracy', fontsize=12)
    plt.legend()

    # Fix X-Axis to show percentages nicely
    plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0%}'))
    plt.tight_layout()
    plt.savefig('graph1_sweet_spot.png', dpi=300)
    plt.show()

    # ==============================================================================
    # Graph 2: Flip Rate Reduction (The "Stability" Metric)
    # Goal: Show that KLD-Guided significantly reduces answer flips compared to 0%
    # ==============================================================================
    plt.figure(figsize=(10, 6))

    # Plot Flip Rate Curve
    sns.barplot(data=nf4_data, x='Threshold', y='Flip', hue='Threshold', palette="viridis", legend=False)

    # Formatting
    plt.title('Impact of KLD Guidance on Output Stability', fontsize=14)
    plt.xlabel('Percentage of Layers Kept in FP16', fontsize=12)
    plt.ylabel('Flip Rate (Lower is Better)', fontsize=12)

    # Fix Y-Axis to percentage
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))

    plt.tight_layout()
    plt.savefig('graph2_flip_rate.png', dpi=300)
    plt.show()

    # ==============================================================================
    # Graph 3: Efficiency Frontier (Perplexity vs. Memory)
    # Goal: Compare ALL methods (NF4, Int8, AWQ, Mixed-2bit)
    # Ideal Position: Bottom-Left Corner (Low Memory, Low Perplexity)
    # ==============================================================================
    plt.figure(figsize=(10, 7))

    # Create Scatter Plot
    # We remove the Baseline from this plot if it skews the scale too much,
    # but usually it's good to keep it to show the Memory gap.
    sns.scatterplot(
        data=model_df,
        x='Mem',
        y='PPL',
        hue='Method',
        style='Method',
        s=200, # Marker size
        alpha=0.8
    )

    # Annotate points
    for row in model_df.itertuples(index=False):
        plt.text(
            row.Mem + 0.02,
            row.PPL + 0.02,
            f"{row.Method}\n({row.Threshold:.0%})",
            fontsize=9
        )

    # A single diverged run (perplexity orders of magnitude above the rest)
    # flattens every other point onto the axis on a linear scale, so switch to
    # a log axis when the spread is that large.
    ppl_min = model_df['PPL'].min()
    ppl_max = model_df['PPL'].max()
    use_log_ppl = ppl_min > 0 and ppl_max / ppl_min > 100
    if use_log_ppl:
        plt.yscale('log')

    # Formatting
    plt.title('Efficiency Frontier: Memory vs. Perplexity', fontsize=14)
    plt.xlabel('Memory Usage (GB)', fontsize=12)
    plt.ylabel(
        'Perplexity (Lower is Better, log scale)' if use_log_ppl else 'Perplexity (Lower is Better)',
        fontsize=12
    )
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig('graph3_efficiency.png', dpi=300)
    plt.show()

    # ==============================================================================
    # Final Data Table
    # ==============================================================================
    print("\n=== Final Master Results Table ===")
    # Reorder columns for readability
    cols = ['Model', 'Method', 'Threshold', 'Acc', 'Flip', 'PPL', 'Mem']
    display_df = model_df[cols].sort_values(['Method', 'Threshold'])
    print(display_df.to_string(index=False))