# **Final Project Notebook**

## **Setup & Dependencies**

In [None]:
!pip uninstall transformers torch torchaudio torchvision wandb -y
!pip install llmcompressor
!pip install -q accelerate bitsandbytes datasets scipy matplotlib wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset, concatenate_datasets
from datasets import Dataset
import copy
import gc
import time
from tqdm import tqdm
import shutil
import wandb

# Pick the compute device once; every later cell moves inputs/modules to `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Reproducibility: seed every RNG the notebook touches and pin cuDNN behaviour.
import random
import numpy as np
from transformers import set_seed

SEED = 42

# Same seeding calls as before, applied in the same order.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    set_seed,
):
    _seed_fn(SEED)

# Disable cuDNN autotuning and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## **Configuration & Experiment Controls**

In [None]:
# --- Experiment Settings ---
# TEST_RUN=True  -> one small model, two thresholds, optional experiments off.
# TEST_RUN=False -> full model list, full threshold sweep, everything enabled.
TEST_RUN = True

# Sample budgets are identical in both modes.
CALIBRATION_SAMPLES = 128
EVAL_SAMPLES = 200 # Keep small for fast iteration, increase for final paper

MODELS_TO_TEST = (
    ["Qwen/Qwen2.5-0.5B-Instruct"]
    if TEST_RUN
    else ["Qwen/Qwen2.5-1.5B-Instruct", "Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-7B-Instruct"]
)
SENSITIVITY_THRESHOLDS = [0.0, 0.05] if TEST_RUN else [0.0, 0.01, 0.05, 0.10, 0.20]

# The FP8 comparison and block-wise profiling only run in the full sweep.
ENABLE_FP8_COMPARISON = not TEST_RUN
ENABLE_BLOCK_WISE = not TEST_RUN

WANDB_PROJECT_NAME = "Test_Run" if TEST_RUN else "KLD_Quantization_Project"

## **Metrics & Helper Functions**

In [None]:
def recursive_getattr(obj, attr):
    """Resolve a dotted attribute path (e.g. ``"a.b.c"``) starting from ``obj``.

    Each path component is looked up with ``getattr``; a missing component
    raises ``AttributeError``.
    """
    head, dot, rest = attr.partition('.')
    child = getattr(obj, head)
    if not dot:
        return child
    return recursive_getattr(child, rest)

def recursive_setattr(obj, attr, val):
    """Assign ``val`` to the attribute named by the dotted path ``attr``.

    Everything before the last dot is resolved with ``recursive_getattr``;
    the final component is then set on that parent object.
    """
    parent_path, _, leaf = attr.rpartition('.')
    if parent_path:
        owner = recursive_getattr(obj, parent_path)
    else:
        owner = obj
    setattr(owner, leaf, val)

In [None]:
# --- Metrics Helpers ---
def compute_kld(logits_p, logits_q):
    """Mean per-token KL(P || Q) between two sets of logits.

    Args:
        logits_p: Reference logits; the last dimension is the vocabulary.
        logits_q: Candidate logits with the same shape as ``logits_p``.

    Returns:
        Python float: KL divergence averaged over every token position.

    Notes:
        ``reduction='batchmean'`` divides by ``input.size(0)`` only, so for
        ``(batch, seq, vocab)`` logits the previous code returned the *sum*
        over sequence positions (and, for 1-D logits, divided by the vocab
        size). Flattening to ``(tokens, vocab)`` makes the result a true
        per-token mean regardless of input rank. Inputs are upcast to float32
        so half-precision logits do not lose precision in the softmax.
    """
    vocab_size = logits_p.size(-1)
    p_probs = F.softmax(logits_p.float().reshape(-1, vocab_size), dim=-1)
    q_log_probs = F.log_softmax(logits_q.float().reshape(-1, logits_q.size(-1)), dim=-1)
    return nn.KLDivLoss(reduction='batchmean')(q_log_probs, p_probs).item()

def calculate_flip_rate(base_preds, new_preds):
    """Fraction of answers that changed relative to the baseline.

    Args:
        base_preds: Baseline predictions, one per evaluated sample.
        new_preds: Predictions from the model under test, aligned by index.

    Returns:
        Float in [0, 1]; 0.0 when either list is empty.

    Notes:
        Only positions present in both lists are compared, and the rate is
        normalised by that number of compared pairs. The previous version
        divided by ``len(base_preds)`` even though ``zip`` had truncated to
        the shorter list, which under-reported flips on a length mismatch.
    """
    if not base_preds or not new_preds: return 0.0
    compared = list(zip(base_preds, new_preds))
    flips = sum(1 for b, n in compared if b != n)
    return flips / len(compared)

def compute_perplexity(model, tokenizer):
    """Computes perplexity on a subset of WikiText-2.

    The first 20 rows of the WikiText-2 (raw) test split are joined and scored
    with a sliding window of ``model.config.max_position_embeddings`` tokens
    and a stride of 512.

    Args:
        model: Causal LM returning ``.loss`` when called with ``labels``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Python float perplexity, or NaN if no token could be scored.

    Notes:
        Each window's loss is a mean over the tokens scored in that window, so
        windows are weighted by their scored-token count before averaging.
        The previous unweighted mean over windows over-weighted a short final
        window. With a single window the result is unchanged.
    """
    texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:20]
    encodings = tokenizer("\n\n".join(texts), return_tensors="pt")
    max_length = model.config.max_position_embeddings
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0   # total negative log-likelihood over all scored tokens
    n_tokens = 0    # number of scored tokens
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride), desc="Computing PPL"):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # tokens not yet scored by an earlier window
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100   # context-only tokens are ignored by the loss

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        # The model shifts labels by one internally, so the first position is never scored.
        num_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())
        if num_loss_tokens > 0:
            nll_sum += outputs.loss.float().item() * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len: break

    if n_tokens == 0:
        return float("nan")
    return torch.exp(torch.tensor(nll_sum / n_tokens)).item()

def measure_efficiency(model, tokenizer, input_text="Hello world"):
    """Measures Inference Latency and Peak VRAM Usage.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        input_text: Prompt used for the timed generation.

    Returns:
        ``(latency_seconds, peak_mem_gb)``. ``peak_mem_gb`` is 0.0 when CUDA
        is unavailable.

    Notes:
        CUDA kernels run asynchronously, so the device is synchronised before
        starting and before stopping the timer. ``time.perf_counter`` is used
        because it is monotonic and higher resolution than ``time.time``.
        CUDA-only calls are guarded so the function also runs on CPU.
    """
    input_ids = tokenizer(input_text, return_tensors="pt").to(device)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    start_time = time.perf_counter()
    with torch.no_grad():
        # Generate 50 tokens to average out overhead
        _ = model.generate(**input_ids, max_new_tokens=50, min_new_tokens=50)
    if use_cuda:
        torch.cuda.synchronize()  # wait for queued kernels before reading the clock
    end_time = time.perf_counter()

    peak_mem = torch.cuda.max_memory_allocated() / 1024**3 if use_cuda else 0.0 # GB
    latency = (end_time - start_time) # Seconds
    return latency, peak_mem

def evaluate_full_suite(model, tokenizer, dataset, metric_name):
    """Run MMLU accuracy, perplexity and efficiency for one model.

    Args:
        model: Model under evaluation.
        tokenizer: Tokenizer forwarded to the perplexity and efficiency helpers.
        dataset: MMLU-style dataset forwarded to ``get_mmlu_predictions``.
        metric_name: Label printed in the progress header.

    Returns:
        ``(accuracy, perplexity, latency, peak_memory, predictions)``.
    """
    print(f"--- Evaluating: {metric_name} ---")

    # Accuracy over at most EVAL_SAMPLES questions.
    preds, truths = get_mmlu_predictions(model, dataset, EVAL_SAMPLES)
    n_correct = 0
    for guess, gold in zip(preds, truths):
        if guess == gold:
            n_correct += 1
    acc = n_correct / len(truths)

    # Language-modelling quality.
    ppl = compute_perplexity(model, tokenizer)

    # Generation latency and peak memory.
    lat, mem = measure_efficiency(model, tokenizer)

    print(f"Results -> Acc: {acc:.2%}, PPL: {ppl:.2f}, Latency: {lat:.2f}s, Mem: {mem:.2f}GB")
    return acc, ppl, lat, mem, preds

In [None]:
# --- MMLU Logic ---
def format_mmlu_prompt(example):
    """Render one MMLU example as a chat-formatted prompt string.

    Reads ``example['question']`` and the first four entries of
    ``example['choices']``, labels them A-D, and passes the resulting
    system + user messages through the notebook-level ``tokenizer``'s chat
    template (untokenized, with the generation prompt appended).
    """
    option_lines = []
    for idx, letter in enumerate("ABCD"):
        option_lines.append(f"{letter}. {example['choices'][idx]}")

    user_lines = [f"Question: {example['question']}", "Options:", *option_lines, "Answer:"]
    system_msg = {
        "role": "system",
        "content": "Output only the single letter (A, B, C, or D) corresponding to the correct answer.",
    }
    user_msg = {"role": "user", "content": "\n".join(user_lines)}

    return tokenizer.apply_chat_template(
        [system_msg, user_msg], tokenize=False, add_generation_prompt=True
    )

def get_mmlu_predictions(model, dataset, num_samples):
    """Predict an A/B/C/D answer for the first ``num_samples`` MMLU examples.

    For each example the chat-formatted prompt is run through ``model`` and
    the answer is whichever of the four letter tokens has the highest logit
    at the final position.

    Args:
        model: Causal LM returning ``.logits``.
        dataset: Indexable dataset whose rows have ``question``, ``choices``
            and an integer ``answer`` index.
        num_samples: Upper bound on the number of examples evaluated.

    Returns:
        ``(predictions, ground_truths)``: two equal-length lists of letters.

    Notes:
        Uses the notebook-level ``tokenizer`` and ``device``.
        Letter ids are encoded with ``add_special_tokens=False`` so that
        tokenizers which prepend BOS do not return the BOS id for every
        letter. The prompt already carries the chat template's special
        tokens, so it is tokenized without adding them a second time.
    """
    predictions, ground_truths = [], []
    choices = ["A", "B", "C", "D"]
    choice_ids = [tokenizer.encode(c, add_special_tokens=False)[0] for c in choices]

    for i in tqdm(range(min(num_samples, len(dataset))), desc="MMLU Eval"):
        ex = dataset[i]
        inputs = tokenizer(
            format_mmlu_prompt(ex), return_tensors="pt", add_special_tokens=False
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            # Restrict the next-token distribution to the four answer letters.
            logits = outputs.logits[0, -1, choice_ids]
            pred = choices[torch.argmax(logits).item()]
        predictions.append(pred)
        ground_truths.append(choices[ex['answer']])
    return predictions, ground_truths

## **Advanced Sensitivity Profiling**

In [None]:
def profile_restoration_sensitivity(model_q, model_ref, calib_input, granularity='layer'):
    """
    Profiles sensitivity by measuring the KLD improvement when restoring
    individual parts of the quantized model (model_q) back to FP16 (model_ref).

    Each candidate module is temporarily replaced in ``model_q`` by a deep copy
    of the same-named module from ``model_ref``, the KLD against the reference
    logits is re-measured, and the quantized module is put back.

    Args:
        model_q: Quantized model; left unchanged on return.
        model_ref: Reference model (may live on a different device).
        calib_input: Token-id tensor used for every forward pass.
        granularity: ``'block'`` profiles each entry of ``model_q.model.layers``;
            ``'layer'`` profiles every named module whose name contains
            ``"mlp"`` or ``"self_attn"``.

    Returns:
        sensitivity_scores: Dict mapping name -> KLD improvement (Higher is more sensitive).
        Names are dotted paths relative to the model root, so they can be passed
        straight to ``recursive_getattr`` / ``perform_surgery``.

    Raises:
        ValueError: Unknown ``granularity``, or ``'block'`` requested on a model
            without ``model.layers``.
    """
    print(f"Profiling Restoration Sensitivity (Granularity: {granularity})...")

    if granularity not in ('block', 'layer'):
        # Previously an unknown value fell through to a NameError on `iterator`.
        raise ValueError(f"Unknown granularity {granularity!r}; expected 'block' or 'layer'.")

    # Compute Baseline
    model_ref.eval()

    with torch.no_grad():
        ref_device = next(model_ref.parameters()).device
        base_logits = model_ref(calib_input.to(ref_device)).logits.to(device)
        current_logits = model_q(calib_input.to(device)).logits
        initial_kld = compute_kld(base_logits, current_logits)

    print(f"Initial Quantized KLD: {initial_kld:.6f}")

    sensitivity_scores = {}

    # Block-wise or Layer-wise
    if granularity == 'block':
        if not (hasattr(model_q, 'model') and hasattr(model_q.model, 'layers')):
            raise ValueError("Could not detect transformer blocks structure.")
        # Paths are relative to the model root: `<root>.model.layers.<idx>`.
        # The old "model.model.layers" prefix never resolved, so every block
        # was skipped and the returned dict was empty.
        target_names = [f"model.layers.{idx}" for idx in range(len(model_q.model.layers))]
        iterator = tqdm(target_names, desc="Profiling Blocks")
    else:
        target_names = [n for n, _ in model_q.named_modules()
                        if "mlp" in n or "self_attn" in n]
        iterator = tqdm(target_names, desc="Profiling Layers")

    # Restoration Loop
    for target_name in iterator:
        try:
            module_ref = recursive_getattr(model_ref, target_name)
            backup_quant_module = recursive_getattr(model_q, target_name)
            module_fp16_gpu = copy.deepcopy(module_ref).to(device)
        except Exception as e:
            print(f"Skipping {target_name}: {e}")
            continue

        try:
            recursive_setattr(model_q, target_name, module_fp16_gpu)

            # Measure New KLD
            with torch.no_grad():
                new_logits = model_q(calib_input.to(device)).logits
                new_kld = compute_kld(base_logits, new_logits)

            sensitivity_scores[target_name] = initial_kld - new_kld
        except Exception as e:
            print(f"Skipping {target_name}: {e}")
        finally:
            # Always put the quantized module back, even if the forward pass
            # failed, so one bad module cannot contaminate later measurements.
            recursive_setattr(model_q, target_name, backup_quant_module)
            # Cleanup VRAM
            del module_fp16_gpu

    return sensitivity_scores

## **The "Surgery" Implementation**

In [None]:
def perform_surgery(model, sensitive_names, fp16_model_cpu):
    """
    Linear-only surgery: swap each named module in 'model' for a fresh
    nn.Linear carrying the weights (and bias, if any) of the same-named
    module in 'fp16_model_cpu'.

    NOTE: a later cell defines perform_surgery again; once that cell has run,
    its definition replaces this one. Names that do not point at a module
    with in_features/out_features are reported and skipped here.
    """
    print(f"Surgery: Replacing {len(sensitive_names)} Sensitive Layers with FP16...")
    restored = 0

    for layer_name in sensitive_names:
        try:
            # Source module from the FP16 backup.
            source = recursive_getattr(fp16_model_cpu, layer_name)

            # Build a Linear of matching shape, then point it at the FP16 tensors.
            n_in, n_out = source.in_features, source.out_features
            has_bias = source.bias is not None
            replacement = nn.Linear(n_in, n_out, bias=has_bias)
            replacement.weight.data = source.weight.data.to(model.device)
            if has_bias:
                replacement.bias.data = source.bias.data.to(model.device)

            # Install it in the quantized model.
            recursive_setattr(model, layer_name, replacement)
        except Exception as err:
            print(f"Skipping layer {layer_name}: {err}")
        else:
            restored += 1

    print(f"Surgery Complete: {restored} layers restored.")

In [None]:
def perform_surgery(model, sensitive_names, fp16_model_cpu):
    """
    Generic surgery: replace each named module in 'model' with a deep copy of
    the same-named module from 'fp16_model_cpu', moved to model.device.

    Because the whole module is copied rather than rebuilt, the replacement
    keeps the source module's class and configuration, so names may refer to
    single Linear layers as well as larger sub-modules.

    Names that cannot be resolved or copied are reported and skipped.
    """
    print(f"Surgery: Replacing {len(sensitive_names)} Sensitive Layers with FP16...")
    restored = 0

    for module_name in sensitive_names:
        try:
            source_module = recursive_getattr(fp16_model_cpu, module_name)
            # Copy first so the FP16 backup itself is never moved or shared.
            replacement = copy.deepcopy(source_module).to(model.device)
            recursive_setattr(model, module_name, replacement)
        except Exception as err:
            print(f"Skipping layer {module_name}: {err}")
        else:
            restored += 1

    print(f"Surgery Complete: {restored} layers restored.")

## **Main Experiment Loop**

In [None]:
# Global experiment setup: W&B login, shared results accumulator, MMLU data.
import wandb
import pandas as pd
from datasets import load_dataset, concatenate_datasets
import os
# Must be set before wandb.login()/wandb.init() to reduce W&B console output.
os.environ["WANDB_QUIET"] = "true"

wandb.login()

# Keep rows from earlier executions of the experiment cells: the list is only
# created the first time this cell runs in a given kernel session.
if 'results_table' not in globals():
    results_table = []

print("Loading MMLU Dataset...")
# We use 'elementary_mathematics' as the subset for this project
# (wrapped in concatenate_datasets, which takes a list of datasets).
try:
    mmlu_dataset = concatenate_datasets([
        load_dataset("cais/mmlu", "elementary_mathematics", split='test')
    ])
    print(f"MMLU Dataset Loaded. Size: {len(mmlu_dataset)} samples.")
except Exception as e:
    # NOTE(review): on any load failure the notebook continues with a single
    # toy question, so every accuracy / flip-rate number logged afterwards is
    # computed on one sample. Check the output above before trusting results.
    print(f"Error loading MMLU: {e}")
    from datasets import Dataset
    mmlu_dataset = Dataset.from_dict({
        "question": ["1+1=?"], "choices": [["1", "2", "3", "4"]], "answer": [1]
    })

print("Global setup complete. Ready for Step 2.")

In [None]:
# Model Selection & Baseline Evaluation
#
# Defines the notebook-level globals used by later cells and by the MMLU
# helpers: `CURRENT_MODEL_ID`, `tokenizer`, `model_fp16`, and the baseline
# metrics (`base_acc`, `base_ppl`, `base_lat`, `base_mem`, `base_preds`).

# Select model
# Only the first entry of MODELS_TO_TEST is used; edit the index to run another model.
CURRENT_MODEL_ID = MODELS_TO_TEST[0]

print(f"{'='*40}\nSelected Model: {CURRENT_MODEL_ID}\n{'='*40}")

tokenizer = AutoTokenizer.from_pretrained(CURRENT_MODEL_ID)
print("Loading FP16 Baseline (This may take a minute)...")
model_fp16 = AutoModelForCausalLM.from_pretrained(
    CURRENT_MODEL_ID,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Evaluate Baseline
# base_preds is kept so later experiments can compute flip rates against it.
base_acc, base_ppl, base_lat, base_mem, base_preds = evaluate_full_suite(
    model_fp16, tokenizer, mmlu_dataset, "FP16 Baseline"
)

# Log Baseline to WandB
run = wandb.init(project=WANDB_PROJECT_NAME, name=f"{CURRENT_MODEL_ID.split('/')[-1]}-Baseline", reinit=True)
wandb.log({
    "Accuracy": base_acc,
    "Perplexity": base_ppl,
    "Latency": base_lat,
    "Memory": base_mem,
    "Threshold": 0,
    "Flip_Rate": 0.0,
    "Method": "Baseline"
})
run.finish()

# Store in Results Table
# Re-running this cell appends another baseline row (results_table is never cleared).
results_table.append({
    "Model": CURRENT_MODEL_ID,
    "Method": "FP16 Baseline",
    "Threshold": 0,
    "Acc": base_acc,
    "Flip": 0.0,
    "PPL": base_ppl,
    "Latency": base_lat,
    "Mem": base_mem
})

print("Baseline Loaded & Evaluated.")

In [None]:
# Profiling & Offloading
#
# Builds `calib_data`, the token-id tensor every sensitivity profile is run on,
# then moves the FP16 reference model to CPU so quantized models fit in VRAM.
print("Preparing Calibration Data...")
# The calibration text is the first 10 rows of the WikiText-2 test split joined
# into one sequence; CALIBRATION_SAMPLES is not consulted here.
calib_data = tokenizer(
    "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10]),
    return_tensors="pt"
).input_ids.to(device)

# NOTE(review): the experiment cells below recompute this expression inline
# instead of reading `granularity_mode`.
granularity_mode = 'block' if ENABLE_BLOCK_WISE else 'layer'

# Offload FP16 Model to CPU to save memory
print("Moving FP16 model to CPU to free up VRAM...")
model_fp16.cpu()
torch.cuda.empty_cache()
print("VRAM Cleared. Ready for Experiments.")

In [None]:
# Experiment A: NF4
#
# Load the model in 4-bit NF4, rank its modules by how much restoring each one
# to FP16 reduces KLD, then restore the top-ranked fraction for each threshold
# and evaluate after every step.

print(f"\n--- Starting Experiment A: NF4 {CURRENT_MODEL_ID} ---")

print("Loading NF4 Model for Profiling & Incremental Surgery...")
model_nf4 = AutoModelForCausalLM.from_pretrained(
    CURRENT_MODEL_ID,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    ),
    device_map="auto",
    trust_remote_code=True
)

sensitivity_map = profile_restoration_sensitivity(
    model_q=model_nf4,
    model_ref=model_fp16,
    calib_input=calib_data,
    granularity='block' if ENABLE_BLOCK_WISE else 'layer'
)
# Most sensitive first: largest KLD improvement when restored.
sorted_layers = sorted(sensitivity_map.items(), key=lambda x: x[1], reverse=True)
all_layer_names = [n for n, s in sorted_layers]

# Experiment loop
# Thresholds are processed in ascending order and the same model object is
# mutated cumulatively: each step only restores the modules ranked between the
# previous target count and the new one.
sorted_thresholds = sorted(SENSITIVITY_THRESHOLDS)
current_restored_count = 0

for threshold in sorted_thresholds:
    print(f"\nTargeting Threshold: {threshold:.0%} kept in FP16")

    # Number of top-ranked modules that should be in FP16 at this threshold
    # (truncated towards zero).
    target_count = int(len(all_layer_names) * threshold)

    layers_to_fix_now = all_layer_names[current_restored_count : target_count]

    if layers_to_fix_now:
        print(f"Restoring {len(layers_to_fix_now)} additional layers...")
        perform_surgery(model_nf4, layers_to_fix_now, model_fp16)
        current_restored_count = target_count
    else:
        print("No new layers to restore for this step.")

    run = wandb.init(
        project=WANDB_PROJECT_NAME,
        name=f"{CURRENT_MODEL_ID.split('/')[-1]}-NF4-{threshold}",
        config={"model": CURRENT_MODEL_ID, "threshold": threshold, "method": "KLD-NF4"},
        reinit=True
    )

    acc, ppl, lat, mem, preds = evaluate_full_suite(
        model_nf4, tokenizer, mmlu_dataset, f"KLD-NF4-{threshold}"
    )

    # Fraction of MMLU answers that differ from the FP16 baseline's answers.
    flip = calculate_flip_rate(base_preds, preds)

    wandb.log({
        "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
        "Memory": mem, "Flip_Rate": flip, "Threshold": threshold
    })

    results_table.append({
        "Model": CURRENT_MODEL_ID,
        "Method": "KLD-NF4",
        "Threshold": threshold,
        "Acc": acc,
        "Flip": flip,
        "PPL": ppl,
        "Latency": lat,
        "Mem": mem
    })

    run.finish()

# Cleanup
del model_nf4
torch.cuda.empty_cache()
print("Experiment A Complete.")

In [None]:
# Experiment B: FP8
#
# Same profile -> incremental restore -> evaluate sweep as Experiment A, run on
# a model loaded with float8 (e4m3) weights. Only runs when
# ENABLE_FP8_COMPARISON is True.

if ENABLE_FP8_COMPARISON:
    print(f"\n--- Starting Experiment B: FP8 {CURRENT_MODEL_ID} ---")

    # 1. Load FP8 Model
    # NOTE(review): this casts the checkpoint directly to torch.float8_e4m3fn
    # without any scaling or quantization config. Confirm that a forward pass
    # works with float8 Linear weights in the installed torch/transformers
    # versions before relying on this experiment.
    # NOTE(review): the baseline cell passes `dtype=` while this call passes
    # `torch_dtype=`; confirm which keyword the installed transformers expects.
    print("Loading FP8 Model...")
    model_fp8 = AutoModelForCausalLM.from_pretrained(
        CURRENT_MODEL_ID,
        torch_dtype=torch.float8_e4m3fn,
        device_map="auto"
    )

    # 2. PROFILE: Find layers hurting FP8 performance
    print("Profiling FP8 Sensitivity...")
    fp8_sensitivity = profile_restoration_sensitivity(
        model_q=model_fp8,
        model_ref=model_fp16,
        calib_input=calib_data,
        granularity='block' if ENABLE_BLOCK_WISE else 'layer'
    )

    # 3. Sort layers by Sensitivity
    # Most sensitive first: largest KLD improvement when restored.
    sorted_fp8 = sorted(fp8_sensitivity.items(), key=lambda x: x[1], reverse=True)
    all_layer_names = [n for n, s in sorted_fp8]

    # 4. Incremental Loop
    # The model is mutated cumulatively across ascending thresholds.
    sorted_thresholds = sorted(SENSITIVITY_THRESHOLDS)
    current_restored_count = 0

    for threshold in sorted_thresholds:
        print(f"\nTargeting Threshold: {threshold:.0%} kept in FP16")

        target_count = int(len(all_layer_names) * threshold)
        layers_to_fix_now = all_layer_names[current_restored_count : target_count]

        if layers_to_fix_now:
            perform_surgery(model_fp8, layers_to_fix_now, model_fp16)
            current_restored_count = target_count

        # 5. Evaluate
        run = wandb.init(
            project=WANDB_PROJECT_NAME,
            name=f"{CURRENT_MODEL_ID.split('/')[-1]}-FP8-{threshold}",
            config={"model": CURRENT_MODEL_ID, "method": "KLD-FP8", "threshold": threshold},
            reinit=True
        )

        acc, ppl, lat, mem, preds = evaluate_full_suite(
            model_fp8, tokenizer, mmlu_dataset, f"KLD-FP8-{threshold:.0%}"
        )

        # Fraction of MMLU answers that differ from the FP16 baseline's answers.
        flip = calculate_flip_rate(base_preds, preds)

        wandb.log({
            "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
            "Memory": mem, "Flip_Rate": flip, "Threshold": threshold
        })

        results_table.append({
            "Model": CURRENT_MODEL_ID,
            "Method": "KLD-FP8",
            "Threshold": threshold,
            "Acc": acc,
            "Flip": flip,
            "PPL": ppl,
            "Latency": lat,
            "Mem": mem
        })
        run.finish()

    del model_fp8
    torch.cuda.empty_cache()
else:
    print("Experiment B skipped.")

In [None]:
# Experiment C: Aggressive Compression (Mixed 2-bit/FP16)

print(f"\n--- Starting Experiment C: Aggressive Compression ({CURRENT_MODEL_ID}) ---")

# --- Helper Function for 2-bit Simulation ---
def fake_quantize_tensor_rtn(w, bits=2):
    """Simple Round-To-Nearest quantization simulation.

    Uses one symmetric scale for the whole tensor (``max|w| / qmax``), rounds
    ``w / scale`` to the signed integer grid ``[-2**(bits-1), 2**(bits-1)-1]``
    and multiplies back, so the result has the same dtype and shape as ``w``
    but only takes values on that grid.

    Args:
        w: Weight tensor to quantize.
        bits: Bit width of the signed integer grid; must be at least 2.

    Returns:
        The fake-quantized tensor. ``w`` itself is returned unchanged when it
        is empty or all zeros.

    Raises:
        ValueError: If ``bits < 2``. With one bit ``qmax`` is 0, which
            previously produced an infinite scale and an all-NaN result.
    """
    if bits < 2:
        raise ValueError(f"bits must be >= 2 for signed RTN quantization, got {bits}")
    if w.numel() == 0:
        return w  # max() of an empty tensor would raise
    qmin, qmax = -(2**(bits-1)), 2**(bits-1) - 1
    scale = w.abs().max() / qmax
    if scale == 0: return w
    w_q = (w / scale).round().clamp(qmin, qmax)
    return w_q * scale

# Start from a GPU copy of the FP16 reference so the reference itself stays intact.
model_aggressive = copy.deepcopy(model_fp16).to(device)

# Every nn.Linear weight is replaced by its fake-quantized version. The values
# are written back into the existing weight tensors, so the measured "Mem"
# below does not reflect any storage saving from 2-bit weights.
print("Applying simulated 2-bit quantization to ALL layers first...")
for name, module in tqdm(model_aggressive.named_modules()):
    if isinstance(module, nn.Linear):
        module.weight.data = fake_quantize_tensor_rtn(module.weight.data, bits=2)

# This experiment always profiles at 'layer' granularity, regardless of
# ENABLE_BLOCK_WISE.
print("Profiling 2-bit Sensitivity...")
agg_sensitivity = profile_restoration_sensitivity(
    model_q=model_aggressive,
    model_ref=model_fp16,
    calib_input=calib_data,
    granularity='layer'
)

# Most sensitive first: largest KLD improvement when restored.
sorted_agg = sorted(agg_sensitivity.items(), key=lambda x: x[1], reverse=True)
all_layer_names = [n for n, s in sorted_agg]

# The model is mutated cumulatively across ascending thresholds: each step
# restores only the modules ranked between the previous and new target counts.
sorted_thresholds = sorted(SENSITIVITY_THRESHOLDS)
current_restored_count = 0

for threshold in sorted_thresholds:
    print(f"\nTargeting Threshold: {threshold:.0%} kept in FP16")

    target_count = int(len(all_layer_names) * threshold)
    layers_to_fix_now = all_layer_names[current_restored_count : target_count]

    if layers_to_fix_now:
        perform_surgery(model_aggressive, layers_to_fix_now, model_fp16)
        current_restored_count = target_count

    run = wandb.init(
        project=WANDB_PROJECT_NAME,
        name=f"{CURRENT_MODEL_ID.split('/')[-1]}-Mixed-2bit-{threshold}",
        config={"model": CURRENT_MODEL_ID, "method": "Mixed-2bit", "threshold": threshold},
        reinit=True
    )

    acc, ppl, lat, mem, preds = evaluate_full_suite(
        model_aggressive, tokenizer, mmlu_dataset, f"Mixed 2-bit-{threshold:.0%}"
    )

    # Fraction of MMLU answers that differ from the FP16 baseline's answers.
    flip = calculate_flip_rate(base_preds, preds)

    wandb.log({
        "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
        "Memory": mem, "Flip_Rate": flip, "Threshold": threshold
    })

    results_table.append({
        "Model": CURRENT_MODEL_ID,
        "Method": "Mixed-2bit",
        "Threshold": threshold,
        "Acc": acc,
        "Flip": flip,
        "PPL": ppl,
        "Latency": lat,
        "Mem": mem
    })
    run.finish()

del model_aggressive
torch.cuda.empty_cache()

In [None]:
# Experiment D: AWQ
#
# Quantize the model to W4A16 with llmcompressor's AWQ modifier, reload the
# compressed checkpoint, then run the same profile -> incremental restore ->
# evaluate sweep as the other experiments.
#
# The whole experiment is best-effort: any failure is reported and skipped.
# Cleanup lives in `finally` so a failure part-way through does not leave an
# open W&B run, the temporary checkpoint directory, or the model in memory.

print(f"\n--- Starting Experiment D: AWQ ({CURRENT_MODEL_ID}) ---")

model_awq = None  # pre-bound so the cleanup below works even if loading fails

try:
    from llmcompressor.modifiers.awq import AWQModifier
    from llmcompressor import oneshot

    print("Running AWQ Oneshot Quantization...")
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    # Drop empty rows, then keep the configured number of calibration samples.
    calib_data_obj = Dataset.from_dict(
        {"text": [text for text in ds["text"] if len(text) > 0][:CALIBRATION_SAMPLES]}
    )

    # lm_head is left unquantized, matching the GPTQ recipe in Experiment E so
    # the two methods are compared on the same set of quantized layers.
    recipe = [AWQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])]
    oneshot(
        model=CURRENT_MODEL_ID,
        dataset=calib_data_obj,
        recipe=recipe,
        output_dir="./awq_temp",
        num_calibration_samples=CALIBRATION_SAMPLES,
        max_seq_length=512,
        save_compressed=True
    )

    model_awq = AutoModelForCausalLM.from_pretrained(
        "./awq_temp", device_map="auto", trust_remote_code=True
    )

    print("Profiling AWQ Sensitivity...")
    awq_sensitivity = profile_restoration_sensitivity(
        model_q=model_awq,
        model_ref=model_fp16,
        calib_input=calib_data,
        granularity='block' if ENABLE_BLOCK_WISE else 'layer'
    )

    # Most sensitive first: largest KLD improvement when restored.
    sorted_awq = sorted(awq_sensitivity.items(), key=lambda x: x[1], reverse=True)
    all_layer_names = [n for n, s in sorted_awq]

    # The model is mutated cumulatively across ascending thresholds.
    sorted_thresholds = sorted(SENSITIVITY_THRESHOLDS)
    current_restored_count = 0

    for threshold in sorted_thresholds:
        print(f"\nTargeting Threshold: {threshold:.0%} kept in FP16")

        target_count = int(len(all_layer_names) * threshold)
        layers_to_fix_now = all_layer_names[current_restored_count : target_count]

        if layers_to_fix_now:
            perform_surgery(model_awq, layers_to_fix_now, model_fp16)
            current_restored_count = target_count

        run = wandb.init(
            project=WANDB_PROJECT_NAME,
            name=f"{CURRENT_MODEL_ID.split('/')[-1]}-AWQ-{threshold}",
            config={"model": CURRENT_MODEL_ID, "method": "KLD-AWQ", "threshold": threshold},
            reinit=True
        )

        acc, ppl, lat, mem, preds = evaluate_full_suite(
            model_awq, tokenizer, mmlu_dataset, f"KLD-AWQ-{threshold:.0%}"
        )

        # Fraction of MMLU answers that differ from the FP16 baseline's answers.
        flip = calculate_flip_rate(base_preds, preds)

        wandb.log({
            "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
            "Memory": mem, "Flip_Rate": flip, "Threshold": threshold
        })

        results_table.append({
            "Model": CURRENT_MODEL_ID,
            "Method": "KLD-AWQ",
            "Threshold": threshold,
            "Acc": acc,
            "Flip": flip,
            "PPL": ppl,
            "Latency": lat,
            "Mem": mem
        })
        run.finish()

except Exception as e:
    print(f"Skipping AWQ: {e}")

finally:
    # Close a run left open by a failure between wandb.init() and run.finish().
    if wandb.run is not None:
        wandb.finish()
    # The temp checkpoint may be missing or half-written after a failure.
    shutil.rmtree("./awq_temp", ignore_errors=True)
    del model_awq
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Experiment E: GPTQ
#
# Quantize the model to W4A16 with llmcompressor's GPTQ modifier (lm_head left
# unquantized), reload the compressed checkpoint, then run the same
# profile -> incremental restore -> evaluate sweep as the other experiments.
#
# The whole experiment is best-effort: any failure is reported and skipped.
# Cleanup lives in `finally` so a failure part-way through does not leave an
# open W&B run, the temporary checkpoint directory, or the model in memory.

print(f"\n--- Starting Experiment E: GPTQ ({CURRENT_MODEL_ID}) ---")

model_gptq = None  # pre-bound so the cleanup below works even if loading fails

try:
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor import oneshot

    print("Running GPTQ Optimization...")
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    # Drop empty rows, then keep the configured number of calibration samples.
    calib_data_obj = Dataset.from_dict(
        {"text": [text for text in ds["text"] if len(text) > 0][:CALIBRATION_SAMPLES]}
    )

    recipe = [
        GPTQModifier(
            targets="Linear",
            scheme="W4A16",
            ignore=["lm_head"],
            dampening_frac=0.01
        )
    ]

    oneshot(
        model=CURRENT_MODEL_ID,
        dataset=calib_data_obj,
        recipe=recipe,
        output_dir="./gptq_temp",
        num_calibration_samples=CALIBRATION_SAMPLES,
        max_seq_length=512,
        save_compressed=True
    )

    model_gptq = AutoModelForCausalLM.from_pretrained(
        "./gptq_temp", device_map="auto", trust_remote_code=True
    )

    print("Profiling GPTQ Sensitivity...")
    gptq_sensitivity = profile_restoration_sensitivity(
        model_q=model_gptq,
        model_ref=model_fp16,
        calib_input=calib_data,
        granularity='block' if ENABLE_BLOCK_WISE else 'layer'
    )

    # Most sensitive first: largest KLD improvement when restored.
    sorted_gptq = sorted(gptq_sensitivity.items(), key=lambda x: x[1], reverse=True)
    all_layer_names = [n for n, s in sorted_gptq]

    # The model is mutated cumulatively across ascending thresholds.
    sorted_thresholds = sorted(SENSITIVITY_THRESHOLDS)
    current_restored_count = 0

    for threshold in sorted_thresholds:
        print(f"\nTargeting Threshold: {threshold:.0%} kept in FP16")

        target_count = int(len(all_layer_names) * threshold)
        layers_to_fix_now = all_layer_names[current_restored_count : target_count]

        if layers_to_fix_now:
            perform_surgery(model_gptq, layers_to_fix_now, model_fp16)
            current_restored_count = target_count

        run = wandb.init(
            project=WANDB_PROJECT_NAME,
            name=f"{CURRENT_MODEL_ID.split('/')[-1]}-GPTQ-{threshold}",
            config={"model": CURRENT_MODEL_ID, "method": "KLD-GPTQ", "threshold": threshold},
            reinit=True
        )

        acc, ppl, lat, mem, preds = evaluate_full_suite(
            model_gptq, tokenizer, mmlu_dataset, f"KLD-GPTQ-{threshold:.0%}"
        )

        # Fraction of MMLU answers that differ from the FP16 baseline's answers.
        flip = calculate_flip_rate(base_preds, preds)

        wandb.log({
            "Accuracy": acc, "Perplexity": ppl, "Latency": lat,
            "Memory": mem, "Flip_Rate": flip, "Threshold": threshold
        })

        results_table.append({
            "Model": CURRENT_MODEL_ID,
            "Method": "KLD-GPTQ",
            "Threshold": threshold,
            "Acc": acc,
            "Flip": flip,
            "PPL": ppl,
            "Latency": lat,
            "Mem": mem
        })
        run.finish()

except Exception as e:
    print(f"Skipping GPTQ: {e}")

finally:
    # Close a run left open by a failure between wandb.init() and run.finish().
    if wandb.run is not None:
        wandb.finish()
    # The temp checkpoint may be missing or half-written after a failure.
    shutil.rmtree("./gptq_temp", ignore_errors=True)
    del model_gptq
    gc.collect()
    torch.cuda.empty_cache()

print("All Experiments Complete.")

# Visualization & Reporting

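Qwen (sample output pasted from an earlier run; its method labels and thresholds differ from what the current cells produce):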
```

                        Model         Method  Threshold    Acc   Flip  \
0  Qwen/Qwen2.5-0.5B-Instruct  FP16 Baseline       0.00  0.250  0.000   
1  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.00  0.265  0.630   
2  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.01  0.265  0.630   
3  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.05  0.280  0.430   
4  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.10  0.280  0.395   
5  Qwen/Qwen2.5-0.5B-Instruct        KLD-NF4       0.20  0.290  0.305   
6  Qwen/Qwen2.5-0.5B-Instruct       KLD-Int8       0.05  0.250  0.155   
7  Qwen/Qwen2.5-0.5B-Instruct     Mixed-2bit       0.05  0.225  0.465   
8  Qwen/Qwen2.5-0.5B-Instruct            AWQ       0.00  0.240  0.430   
9  Qwen/Qwen2.5-0.5B-Instruct           GPTQ       0.00  0.205  0.385   

             PPL   Latency       Mem  
0       8.350414  2.043066  0.942955  
1       8.855442  2.643480  0.458393  
2       8.855442  2.512811  0.711964  
3       8.830623  2.543613  0.949767  
4       8.774139  2.509315  1.014822  
5       8.657179  2.484945  1.139766  
6       8.353897  7.321452  1.071951  
7  537496.375000  1.837433  1.131078  
8      12.012968  5.784297  3.139405  
9       9.055728  4.997509  1.655096

```



In [None]:
# View Results Table
# One row per evaluated (model, method, threshold) combination appended so far.
df = pd.DataFrame(results_table)
print(df)

In [None]:
# Save Results to Google Drive
# This cell only works inside Google Colab (it imports google.colab).
from google.colab import drive
import pandas as pd
import os

# 1. Mount Google Drive
# This will trigger a popup asking for permission
drive.mount('/content/drive')

In [None]:
# 2. Define Filename
# We include the model name to avoid overwriting previous results.
# The destination is a fixed folder on the mounted Drive; adjust it for your
# own Drive layout.
model_name = CURRENT_MODEL_ID.split('/')[-1] if 'CURRENT_MODEL_ID' in globals() else "experiment"
# Path now starts with a single slash (the original had a stray leading "//").
filename = f"/content/drive/MyDrive/Columbia-LLMSeminar/SLLM project/Mena/{model_name}_1204results.csv"

# 3. Create Directory if it doesn't exist
os.makedirs(os.path.dirname(filename), exist_ok=True)

# 4. Save
if results_table:
    df = pd.DataFrame(results_table)
    df.to_csv(filename, index=False)
    print(f"✅ Success! Results saved to Google Drive at:\n{filename}")
else:
    print("⚠️ Warning: results_table is empty. Nothing to save.")

In [None]:
# Log Final Summary to WandB
#
# Opens one extra W&B run that carries the full results table, the saved plot
# images (if present on disk) and an interactive memory-vs-perplexity scatter.

print("\n--- Uploading Final Report to Weights & Biases ---")

# 1. Initialize a generic "Summary" run
run = wandb.init(
    project=WANDB_PROJECT_NAME,
    name="Final-Summary-Report",
    job_type="report"
)

# 2. Upload the Master Data Table
# This allows you to query/sort your results in the WandB UI
if results_table:
    tbl = wandb.Table(dataframe=pd.DataFrame(results_table))
    wandb.log({"Experiment_Results_Raw": tbl})

# 3. Upload the Matplotlib Images
# NOTE(review): these PNGs are written by the "Visualization & Reporting" cell,
# which appears AFTER this cell in the notebook. On a top-to-bottom run the
# files will not exist yet and the warning branch below is taken; run the
# visualization cell first (or move it above this one).
import os
image_files = ['graph1_sweet_spot.png', 'graph2_flip_rate.png', 'graph3_efficiency.png']

for img_file in image_files:
    if os.path.exists(img_file):
        wandb.log({img_file.replace(".png", ""): wandb.Image(img_file)})
        print(f"Uploaded {img_file}")
    else:
        print(f"Warning: {img_file} not found. Did you run Cell 7?")

# 4. Create an Interactive Custom Chart (Efficiency Frontier)
# This creates a native WandB chart where you can hover over dots to see model details
if results_table:
    data = [[r['Method'], r['Threshold'], r['Mem'], r['PPL']] for r in results_table]
    table = wandb.Table(data=data, columns=["Method", "Threshold", "Memory", "Perplexity"])

    # Custom Scatter Plot definition
    wandb.log({
        "Efficiency_Frontier_Interactive": wandb.plot.scatter(
            table, "Memory", "Perplexity", title="Efficiency Frontier (Interactive)"
        )
    })

run.finish()
print("Upload Complete. Check your WandB Dashboard!")

In [None]:
# Visualization & Reporting
#
# Builds three figures from results_table for the current model and saves them
# as graph1_sweet_spot.png, graph2_flip_rate.png and graph3_efficiency.png in
# the working directory, then prints a sorted summary table.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Prepare Data
if not results_table:
    print("No results to plot! Run the experiments first.")
else:
    df = pd.DataFrame(results_table)

    # Optional: Save raw data to CSV for your paper
    df.to_csv("experiment_results.csv", index=False)
    print("Results saved to 'experiment_results.csv'")

    # Filter for the model we just tested (or select the first one available)
    target_model = CURRENT_MODEL_ID if 'CURRENT_MODEL_ID' in globals() else df['Model'].unique()[0]
    model_df = df[df['Model'] == target_model]

    print(f"\nGenerating Plots for: {target_model}")

    # Set Style
    sns.set_theme(style="whitegrid")
    plt.rcParams.update({'font.size': 12})

    # ==============================================================================
    # Graph 1: The "Sweet Spot" (Accuracy vs. Threshold)
    # Goal: Show that 5% FP16 recovery beats the Standard NF4 baseline
    # ==============================================================================
    plt.figure(figsize=(10, 6))

    # Filter for KLD-NF4 data points
    nf4_data = model_df[model_df['Method'] == 'KLD-NF4'].sort_values('Threshold')

    # Plot the KLD Curve
    sns.lineplot(data=nf4_data, x='Threshold', y='Acc', marker='o', label='KLD-Guided NF4', linewidth=2.5)

    # Plot Baseline Reference (FP16)
    # NOTE(review): `.values[0]` raises IndexError when results_table has no
    # 'FP16 Baseline' row for target_model (e.g. the baseline cell was skipped).
    baseline_acc = model_df[model_df['Method'] == 'FP16 Baseline']['Acc'].values[0]
    plt.axhline(y=baseline_acc, color='green', linestyle='--', label=f'FP16 Baseline ({baseline_acc:.1%})')

    # Formatting
    plt.title(f'The Sweet Spot: Accuracy vs. FP16 Retention ({target_model})', fontsize=14)
    plt.xlabel('Percentage of Layers Kept in FP16', fontsize=12)
    plt.ylabel('MMLU Accuracy', fontsize=12)
    plt.legend()

    # Fix X-Axis to show percentages nicely
    plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0%}'))
    plt.tight_layout()
    plt.savefig('graph1_sweet_spot.png', dpi=300)
    plt.show()

    # ==============================================================================
    # Graph 2: Flip Rate Reduction (The "Stability" Metric)
    # Goal: Show that KLD-Guided significantly reduces answer flips compared to 0%
    # ==============================================================================
    plt.figure(figsize=(10, 6))

    # Plot Flip Rate Curve (one bar per KLD-NF4 threshold)
    sns.barplot(data=nf4_data, x='Threshold', y='Flip', hue='Threshold', palette="viridis", legend=False)

    # Formatting
    plt.title(f'Impact of KLD Guidance on Output Stability', fontsize=14)
    plt.xlabel('Percentage of Layers Kept in FP16', fontsize=12)
    plt.ylabel('Flip Rate (Lower is Better)', fontsize=12)

    # Fix Y-Axis to percentage
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))

    plt.tight_layout()
    plt.savefig('graph2_flip_rate.png', dpi=300)
    plt.show()

    # ==============================================================================
    # Graph 3: Efficiency Frontier (Perplexity vs. Memory)
    # Goal: Compare ALL methods present in results_table for this model
    # Ideal Position: Bottom-Left Corner (Low Memory, Low Perplexity)
    # ==============================================================================
    plt.figure(figsize=(10, 7))

    # Create Scatter Plot
    # We remove the Baseline from this plot if it skews the scale too much,
    # but usually it's good to keep it to show the Memory gap.
    # NOTE(review): the y-axis is linear, so a single very large perplexity
    # (as a heavily degraded model can produce) will compress every other
    # point onto the x-axis; consider a log scale or filtering outliers.
    sns.scatterplot(
        data=model_df,
        x='Mem',
        y='PPL',
        hue='Method',
        style='Method',
        s=200, # Marker size
        alpha=0.8
    )

    # Annotate points with method name and threshold, offset slightly from the marker
    for i in range(model_df.shape[0]):
        row = model_df.iloc[i]
        plt.text(
            row.Mem + 0.02,
            row.PPL + 0.02,
            f"{row.Method}\n({row.Threshold:.0%})",
            fontsize=9
        )

    # Formatting
    plt.title(f'Efficiency Frontier: Memory vs. Perplexity', fontsize=14)
    plt.xlabel('Memory Usage (GB)', fontsize=12)
    plt.ylabel('Perplexity (Lower is Better)', fontsize=12)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig('graph3_efficiency.png', dpi=300)
    plt.show()

    # ==============================================================================
    # Final Data Table
    # ==============================================================================
    print("\n=== Final Master Results Table ===")
    # Reorder columns for readability (the Latency column is not included here)
    cols = ['Model', 'Method', 'Threshold', 'Acc', 'Flip', 'PPL', 'Mem']
    display_df = model_df[cols].sort_values(['Method', 'Threshold'])
    print(display_df.to_string(index=False))