# KVShuttle: FP16 Generation Quality on GPU

This notebook runs the end-to-end generation quality experiment using
FP16 PyTorch models on a CUDA GPU (T4/A100).

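**Modes:** the smoke test runs qwen2.5-3b only on 5 prompts (~5-10 min on T4); the full experiment runs 3 models on 50 prompts (~2-3 hrs on T4).
The `CONFIG` cell below currently selects the full experiment; switch it to the smoke config for a quick check.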

## Setup
Create the zip locally first:
```bash
cd /path/to/KVShuttle
zip -r kvshuttle.zip . -x '.git/*'
```
Then upload when prompted in the install cell below.

In [None]:
# Check GPU availability
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if not cuda_ok:
    # Fail fast: the experiment in this notebook runs FP16 models on a CUDA GPU.
    raise RuntimeError("No GPU detected! Go to Runtime > Change runtime type > GPU")

# The device-properties field holding the memory size is `total_memory` (in bytes);
# `total_mem` does not exist and raises AttributeError.
gpu_props = torch.cuda.get_device_properties(0)
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Install dependencies
!pip install -q transformers accelerate datasets pyyaml tqdm

# Upload kvshuttle.zip (created locally via: cd KVShuttle && zip -r kvshuttle.zip . -x '.git/*')
import os
if not os.path.exists("kvshuttle"):
    from google.colab import files
    print("Upload kvshuttle.zip (see instructions in markdown above)")
    uploaded = files.upload()  # upload kvshuttle.zip
    !unzip -qo kvshuttle.zip -d KVShuttle
    %cd KVShuttle
    !pip install -q -e .
else:
    print("KVShuttle already installed")

In [None]:
# Sanity check: these imports succeed only if the KVShuttle package and its
# torch backend modules are importable from this kernel.
from kvshuttle.models.loader_torch import TORCH_MODEL_REGISTRY, load_model_torch
from kvshuttle.models.kv_extractor_torch import extract_kv_cache_torch
from kvshuttle.models.kv_injector_torch import forward_continuation_with_kv_cache_torch
from kvshuttle.compression.registry import list_compressors

# Report the registered model names.
model_names = list(TORCH_MODEL_REGISTRY.keys())
print(f"Available models: {model_names}")

# Report what the compressor registry lists.
compressor_names = list_compressors()
print(f"Available compressors: {compressor_names}")

## Run the experiment

Set `CONFIG` to choose between smoke test (quick) and full run.

In [None]:
# Choose config: smoke test (qwen2.5-3b, 5 prompts) or full (3 models, 50 prompts)
# CONFIG = "experiments/configs/generation_quality_torch_smoke.yaml"  # ~5-10 min on T4
CONFIG = "experiments/configs/generation_quality_torch.yaml"          # ~2-3 hrs on T4

!python -m experiments.scripts.run_experiment {CONFIG}

In [None]:
# Inspect results
import json
from pathlib import Path

import pandas as pd

# Directories that may hold a results.json (the path names suggest one for the
# smoke-test run and one for the full run).
results_dirs = [
    Path("experiments/results/generation_quality_fp16_smoke"),
    Path("experiments/results/generation_quality_fp16"),
]


def find_latest_results(candidate_dirs):
    """Return the newest ``results.json`` found under ``candidate_dirs``.

    Args:
        candidate_dirs: Iterable of ``Path`` directories to look in.

    Returns:
        Path of the most recently modified ``results.json``, or ``None`` when
        no candidate directory contains one.
    """
    found = [d / "results.json" for d in candidate_dirs if (d / "results.json").exists()]
    # Choose by modification time so a leftover file from an earlier run in a
    # different mode cannot shadow the run that just finished. On a tie the
    # earlier entry in candidate_dirs wins.
    return max(found, key=lambda p: p.stat().st_mtime, default=None)


results_path = find_latest_results(results_dirs)

if results_path:
    print(f"Reading {results_path}")
    with open(results_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Metadata: {json.dumps(data['metadata'], indent=2)}")
    print(f"\nTotal results: {len(data['results'])}")

    df = pd.DataFrame(data['results'])
    print(f"\nAvailable columns: {list(df.columns)}")

    # Aggregate only the quality-metric columns that are actually present.
    agg_cols = {
        col: 'mean'
        for col in ['mean_key_cosine_sim', 'perplexity_delta', 'token_agreement']
        if col in df.columns
    }

    if agg_cols:
        # Mean of each available metric per (model, compressor) pair.
        summary = df.groupby(['model', 'compressor']).agg(agg_cols).round(4)
        display(summary)
    else:
        print("\nNo quality metrics found — check WARNING logs above for errors")
else:
    print("Results not found. Check experiment output above for errors.")

In [None]:
# Offer the results file as a browser download so figures can be generated locally.
# `results_path` comes from the results-inspection cell; it is None when no
# results file was found.
has_results_file = results_path is not None and results_path.exists()
if has_results_file:
    from google.colab import files

    download_target = str(results_path)
    files.download(download_target)
    print(f"Downloaded {results_path}")