In [2]:
pip install datasets sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import time
import torch
import numpy as np
from sklearn.cluster import KMeans
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    OPTForSequenceClassification,
    AutoTokenizer,
)
from datasets import load_dataset

# Constants
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_CLUSTERS = 16
MAX_SAMPLES = 1000
SENSITIVE_LAYER_PERCENTAGE = 0.3
EPOCHS = 3
BATCH_SIZE = 4  # Adjusted for large models
LEARNING_RATE = 1e-5

# Model Configurations
MODELS = {
    "llama-7b": {
        "model_name": "meta-llama/Llama-7B",
        "tokenizer_name": "meta-llama/Llama-7B",
        "model_class": LlamaForSequenceClassification,
        "tokenizer_class": LlamaTokenizer,
    },
    "llama-33b": {
        "model_name": "meta-llama/Llama-33B",
        "tokenizer_name": "meta-llama/Llama-33B",
        "model_class": LlamaForSequenceClassification,
        "tokenizer_class": LlamaTokenizer,
    },
    "opt-2.7b": {
        "model_name": "facebook/opt-2.7b",
        "tokenizer_name": "facebook/opt-2.7b",
        "model_class": OPTForSequenceClassification,
        "tokenizer_class": AutoTokenizer,
    },
}


# Preprocessing Data
def preprocess_data(dataset, max_samples=100, max_length=512):
    inputs = []
    labels = []

    for i, example in enumerate(dataset):
        if i >= max_samples:
            break

        # Extract text input based on dataset structure
        if "question" in example:  # BoolQ-like datasets
            text = f"Question: {example['question']} Context: {example.get('context', example.get('passage', ''))}"
        elif "ctx" in example and "endings" in example:  # For HellaSwag
            text = f"Context: {example['ctx']} Ending: {example['endings'][0]}"  # Using the first ending
        else:
            raise ValueError("Unsupported dataset format or missing keys.")

        # Extract the label dynamically
        if "answer" in example:  # BoolQ-like datasets
            label = int(example["answer"])  # Convert boolean to integer (True=1, False=0)
        elif "label" in example:
            label = example["label"]
        elif "gold_label" in example:
            label = example["gold_label"]
        else:
            label = None  # Default if no valid label is found

        if label is None:
            print(f"Skipping example due to missing label: {example}")
            continue  # Skip this example

        inputs.append(text)
        labels.append(label)

    tokenized_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
    return tokenized_inputs, torch.tensor(labels, dtype=torch.long)


# Calculating Layer Sensitivities
def calculate_layer_sensitivities(model, is_llama):
    sensitivities = []
    layers = model.model.layers if is_llama else model.model.decoder.layers
    for layer in layers:
        key_weights = layer.self_attn.k_proj.weight.detach().cpu().numpy()
        value_weights = layer.self_attn.v_proj.weight.detach().cpu().numpy()
        sensitivity = np.var(key_weights) + np.var(value_weights)
        sensitivities.append(sensitivity)
    return sensitivities


# Identifying Top Sensitive Layers
def get_top_sensitive_layers(sensitivities, percentage):
    num_sensitive_layers = int(len(sensitivities) * percentage)
    top_layers = np.argsort(sensitivities)[-num_sensitive_layers:]
    return sorted(top_layers)


# Clustering Layer Weights
def cluster_layers(model, num_clusters, is_llama):
    layers = model.model.layers if is_llama else model.model.decoder.layers
    for layer in layers:
        for proj_name in ["k_proj", "v_proj"]:
            proj = getattr(layer.self_attn, proj_name)
            weights = proj.weight.detach().cpu().numpy()
            original_shape = weights.shape
            flattened_weights = weights.reshape(-1, 1)

            # Apply KMeans clustering
            kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(flattened_weights)
            clustered_weights = kmeans.cluster_centers_[kmeans.labels_]
            clustered_weights = clustered_weights.reshape(original_shape)

            # Update weights with clustered weights
            with torch.no_grad():
                proj.weight.copy_(torch.tensor(clustered_weights, device=DEVICE))


# Fine-Tuning Sensitive Layers
def fine_tune_model(model, dataloader, sensitive_layers, is_llama):
    # Freeze all layers except the sensitive ones
    layer_prefix = "model.layers" if is_llama else "model.decoder.layers"
    for name, param in model.named_parameters():
        param.requires_grad = any(f"{layer_prefix}.{i}." in name for i in sensitive_layers)

    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
    criterion = torch.nn.CrossEntropyLoss()

    model.train()
    for epoch in range(EPOCHS):
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            inputs = {'input_ids': input_ids.to(DEVICE), 'attention_mask': attention_mask.to(DEVICE)}
            labels = labels.to(DEVICE)

            optimizer.zero_grad()
            with torch.cuda.amp.autocast():  # Mixed precision training
                outputs = model(**inputs)
                logits = outputs.logits
                loss = criterion(logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()


# Evaluating the Model
def evaluate_model(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            inputs = {'input_ids': input_ids.to(DEVICE), 'attention_mask': attention_mask.to(DEVICE)}
            outputs = model(**inputs)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            correct += (predictions == labels.to(DEVICE)).sum().item()
            total += labels.size(0)
    return correct / total


# Create Dataloader
def create_dataloader(inputs, labels, batch_size):
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    dataset = TensorDataset(input_ids, attention_mask, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Main Workflow
def main_large_models(model_key, dataset, dataset_name):
    global tokenizer
    model_config = MODELS[model_key]
    tokenizer = model_config["tokenizer_class"].from_pretrained(model_config["tokenizer_name"])
    model = model_config["model_class"].from_pretrained(model_config["model_name"], num_labels=2).to(DEVICE)
    model.gradient_checkpointing_enable()

    tokenized_inputs, labels = preprocess_data(dataset, MAX_SAMPLES)
    dataloader = create_dataloader(tokenized_inputs, labels, BATCH_SIZE)

    is_llama = "llama" in model_key
    sensitivities = calculate_layer_sensitivities(model, is_llama)
    sensitive_layers = get_top_sensitive_layers(sensitivities, SENSITIVE_LAYER_PERCENTAGE)

    print(f"Sensitive Layers for {dataset_name} ({model_key}):", sensitive_layers)

    cluster_layers(model, NUM_CLUSTERS, is_llama)

    start_time = time.time()
    accuracy_before = evaluate_model(model, dataloader)
    print(f"Accuracy Before Fine-Tuning on {dataset_name} ({model_key}): {accuracy_before}")

    fine_tune_model(model, dataloader, sensitive_layers, is_llama)

    accuracy_after = evaluate_model(model, dataloader)
    end_time = time.time()
    print(f"Accuracy After Fine-Tuning on {dataset_name} ({model_key}): {accuracy_after}")
    print(f"Accuracy Drop on {dataset_name} ({model_key}): {accuracy_before - accuracy_after}")
    print(f"Total Time for {dataset_name} ({model_key}): {end_time - start_time} seconds")


# Example Dataset Placeholder
for model_key in MODELS.keys():
    dataset_name = "boolq"
    dataset = load_dataset("super_glue", dataset_name, split="validation")
    main_large_models(model_key, dataset, dataset_name)


OSError: meta-llama/Llama-7B is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

benchmark

In [5]:
import time
import torch
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from sklearn.metrics import accuracy_score

# Paths and parameters
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_CLUSTERS = 2000
MAX_SAMPLES = 100

# Supported models and their configurations
MODEL_CONFIGS = {
    "opt-2.7b": {
        "model_name": "facebook/opt-2.7b",
        "tokenizer_name": "facebook/opt-66b"
    },
    "llama-7b": {
        "model_name": "meta-llama/LLaMA-7B",
        "tokenizer_name": "meta-llama/LLaMA-7B"
    },
    "llama-33b": {
        "model_name": "meta-llama/LLaMA-33B",
        "tokenizer_name": "meta-llama/LLaMA-33B"
    }
}

def calculate_kv_cache_size_final(model, num_clusters):
    """
    Calculate the total size of K and V projection weights for the clustered model.
    """
    total_size = 0
    num_layers = model.config.num_hidden_layers
    head_dim = model.config.hidden_size // model.config.num_attention_heads  # Per-head dimension

    for _ in range(num_layers):
        # Each cluster contributes one set of weights for keys and values
        k_proj_size = num_clusters * head_dim
        v_proj_size = num_clusters * head_dim
        total_size += k_proj_size + v_proj_size

    return total_size * 2

def preprocess_data(dataset, dataset_name, max_samples=100, max_length=512):
    inputs, labels = [], []
    for i, example in enumerate(dataset):
        if i >= max_samples:
            break
        try:
            if dataset_name == "hellaswag":
                text = f"Context: {example['ctx']} Ending: {example['endings'][0]}"
                label = int(example['label'])
            elif dataset_name == "piqa":
                text = f"Question: {example['goal']} Choice: {example['sol1']}"
                label = int(example['label'])
            else:
                raise ValueError(f"Unsupported dataset: {dataset_name}")
            inputs.append(text)
            labels.append(label)
        except KeyError as e:
            print(f"Skipping example due to missing key: {e}")
        except ValueError as e:
            print(f"Skipping example due to invalid label: {e}")
    if len(inputs) == 0 or len(labels) == 0:
        raise ValueError(f"No valid examples found in dataset: {dataset_name}")
    tokenized_inputs = tokenizer(
        inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
    )
    return tokenized_inputs, torch.tensor(labels, dtype=torch.long)



def calculate_kv_cache_size(model):
    """
    Calculates the total size of the key-value (KV) cache for the model.
    """
    kv_cache_size = 0
    for layer in model.model.decoder.layers:
        # Key and Value weights
        key_size = layer.self_attn.k_proj.weight.numel()
        value_size = layer.self_attn.v_proj.weight.numel()
        kv_cache_size += key_size + value_size
    return kv_cache_size

def cluster_attention_heads(model, num_clusters):
    """
    Clusters the key and value projection weights of all attention heads in all layers.
    """
    for layer_idx, layer in enumerate(model.model.decoder.layers):
        # Extract key and value weights
        k_weights = layer.self_attn.k_proj.weight.data
        v_weights = layer.self_attn.v_proj.weight.data

        # Reshape to (num_heads, head_dim, hidden_size)
        num_heads = model.config.num_attention_heads
        head_dim = model.config.hidden_size // num_heads

        k_weights = k_weights.view(num_heads, head_dim, -1).cpu().numpy()
        v_weights = v_weights.view(num_heads, head_dim, -1).cpu().numpy()

        # Flatten for clustering
        k_flat = k_weights.reshape(num_heads, -1)
        v_flat = v_weights.reshape(num_heads, -1)

        # Adjust number of clusters to be <= num_heads
        adjusted_clusters = min(num_heads, num_clusters)
        print(f"Layer {layer_idx}: Adjusting clusters to {adjusted_clusters} (num_heads={num_heads})")

        # Perform clustering
        kmeans_k = KMeans(n_clusters=adjusted_clusters, random_state=0).fit(k_flat)
        kmeans_v = KMeans(n_clusters=adjusted_clusters, random_state=0).fit(v_flat)

        # Replace with centroids
        k_clustered = torch.tensor(kmeans_k.cluster_centers_, device=DEVICE)
        v_clustered = torch.tensor(kmeans_v.cluster_centers_, device=DEVICE)

        # Reshape and update weights
        layer.self_attn.k_proj.weight.data = k_clustered.view_as(layer.self_attn.k_proj.weight.data)
        layer.self_attn.v_proj.weight.data = v_clustered.view_as(layer.self_attn.v_proj.weight.data)

    print("Clustering of attention heads completed.")

def evaluate_model(model, tokenized_inputs):
    """
    Evaluates the model.
    """
    input_ids = tokenized_inputs["input_ids"].to(DEVICE)
    attention_mask = tokenized_inputs["attention_mask"].to(DEVICE)

    start_time = time.time()

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

    end_time = time.time()
    total_time = end_time - start_time
    print(f"Evaluation time: {total_time:.2f} seconds")
    return total_time

def process_model(model_name, dataset_name):
    """
    Loads, clusters, and evaluates the model on a dataset.
    """
    model_config = MODEL_CONFIGS[model_name]
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_config["tokenizer_name"])
    model = AutoModelForCausalLM.from_pretrained(model_config["model_name"]).to(DEVICE)

    dataset = load_dataset(dataset_name, split="validation")
    tokenized_inputs = preprocess_data(dataset,dataset_name)

    # Initial KV cache size
    initial_kv_cache_size = calculate_kv_cache_size(model)
    print(f"Initial KV cache size: {initial_kv_cache_size} elements")

    # Cluster attention heads
    cluster_attention_heads(model, NUM_CLUSTERS)

    # Final KV cache size
    final_kv_cache_size = calculate_kv_cache_size_final(model, NUM_CLUSTERS)
    print(f"Final KV cache size: {final_kv_cache_size} elements")

    # KV cache reduction
    kv_cache_reduction_percentage = ((initial_kv_cache_size - final_kv_cache_size) / initial_kv_cache_size) * 100
    print(f"KV cache reduction percentage: {kv_cache_reduction_percentage:.2f}%")

    # Evaluate model
    eval_time = evaluate_model(model, tokenized_inputs)
    print(f"Evaluation time: {eval_time:.2f} seconds")

def main():
    datasets = ["hellaswag"]
    models = ["opt-2.7b"] #"llama-33b"]
    for model_name in models:
        for dataset_name in datasets:
            print(f"Processing model {model_name} on dataset {dataset_name}")
            process_model(model_name, dataset_name)

if __name__ == "__main__":
    main()


Processing model opt-2.7b on dataset hellaswag


TypeError: preprocess_data() missing 1 required positional argument: 'dataset_name'

In [None]:
a=5