# KVShuttle: FP16 Generation Quality on GPU

## Setup
1. Create zip locally: `cd KVShuttle && zip -r kvshuttle.zip . -x '.git/*'`
2. Set runtime to GPU (T4 or A100)
3. Run all cells — upload zip when prompted
4. For gated models (e.g. Llama): accept the model license on Hugging Face and paste your HF token into the login cell before running

In [None]:
# Check GPU availability
# Fails fast with an actionable message when the runtime has no CUDA device,
# otherwise reports the device name and its total memory.
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties field is `total_memory` (in bytes); `total_mem`
    # does not exist and raises AttributeError.
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    raise RuntimeError("No GPU detected! Go to Runtime > Change runtime type > GPU")

In [None]:
# Install dependencies + upload KVShuttle
!pip install -q transformers accelerate datasets pyyaml tqdm

import os
if not os.path.exists("kvshuttle"):
    from google.colab import files
    print("Upload kvshuttle.zip")
    uploaded = files.upload()
    !unzip -qo kvshuttle.zip -d KVShuttle
    %cd KVShuttle
    !pip install -q -e .
else:
    print("KVShuttle already installed")

In [None]:
# HuggingFace login (required for gated models like Llama)
# Paste your token from https://huggingface.co/settings/tokens into `hf_token`
# below, or export it as the HF_TOKEN environment variable so it never has to
# be stored in the notebook.
import os

from huggingface_hub import login

HF_TOKEN_PLACEHOLDER = "PASTE_YOUR_HF_TOKEN_HERE"
hf_token = "PASTE_YOUR_HF_TOKEN_HERE"

# If the placeholder was left untouched, do not send it as a token: fall back
# to the environment variable, and if that is unset pass None so that
# huggingface_hub prompts for the token interactively.
if hf_token == HF_TOKEN_PLACEHOLDER:
    hf_token = os.environ.get("HF_TOKEN") or None

login(token=hf_token)

In [None]:
# Verify installation
# Importing these two names confirms the editable install is importable;
# printing them shows which models and compressors are registered.
from kvshuttle.models.loader_torch import TORCH_MODEL_REGISTRY
from kvshuttle.compression.registry import list_compressors

registered_models = list(TORCH_MODEL_REGISTRY.keys())
print(f"Models: {registered_models}")

registered_compressors = list_compressors()
print(f"Compressors: {registered_compressors}")

In [None]:
# Write config inline (avoids zip caching issues)
import yaml
from pathlib import Path

config = {
    "experiment": {"name": "generation_quality_fp16", "description": "Mistral-7B + Qwen2.5-7B FP16 on A100 GPU"},
    "backend": "torch",
    "models": ["mistral-7b", "qwen2.5-7b"],
    "compressors": ["identity", "uniform_int8", "uniform_int4", "kivi_2bit", "cachegen", "palu_lr", "cascade_prune50_int4"],
    "bandwidths_gbps": [10],
    "prompts": {"source": "wikitext", "count": 50, "min_tokens": 128, "max_tokens": 512},
    "evaluation": {"attention_error": True, "perplexity": True, "token_agreement": True},
    "output": {"dir": "experiments/results/generation_quality_fp16_7b", "save_per_layer": False},
}

config_path = Path("experiments/configs/generation_quality_torch_7b.yaml")
config_path.parent.mkdir(parents=True, exist_ok=True)
with open(config_path, "w") as f:
    yaml.dump(config, f)
print(f"Wrote config to {config_path}")

!python -m experiments.scripts.run_experiment {config_path}

In [None]:
# Inspect results
# Locates the first available results.json among the known output directories
# and shows per-(model, compressor) means of whichever quality metrics exist.
import json
from pathlib import Path

# Candidate output directories, checked in this order.
results_dirs = [
    Path(location)
    for location in (
        "experiments/results/generation_quality_fp16_7b",
        "experiments/results/generation_quality_fp16_llama",
        "experiments/results/generation_quality_fp16_smoke",
        "experiments/results/generation_quality_fp16",
    )
]

# First results.json that exists on disk, or None when there is none.
results_path = next(
    filter(Path.exists, (directory / "results.json" for directory in results_dirs)),
    None,
)

if results_path is None:
    print("Results not found. Check experiment output above for errors.")
else:
    data = json.loads(results_path.read_text())
    print(f"Results: {len(data['results'])} from {results_path}")

    import pandas as pd
    df = pd.DataFrame(data['results'])

    # Only aggregate metric columns that are actually present in the results.
    quality_metrics = ('mean_key_cosine_sim', 'perplexity_delta', 'token_agreement')
    agg_cols = dict.fromkeys(
        (metric for metric in quality_metrics if metric in df.columns),
        'mean',
    )

    if not agg_cols:
        print("No quality metrics — check WARNING logs above")
    else:
        summary = df.groupby(['model', 'compressor']).agg(agg_cols).round(4)
        display(summary)

In [None]:
# Download results
# Offers results.json as a browser download, but only when the inspection
# cell found a results file and it is still present on disk.
has_results_file = results_path is not None and results_path.exists()
if has_results_file:
    from google.colab import files
    files.download(str(results_path))