In [1]:
from flask import Flask, request, jsonify, render_template
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
from transformers import OPTForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.cluster import KMeans
from kneed import KneeLocator
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_opt_classifier(model_name):
    """Load the specified OPT model and tokenizer."""
    model = OPTForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

def get_model_size(model, path="temp_model.pth"):
    """Calculate model size in MB."""
    torch.save(model.state_dict(), path)
    size_mb = os.path.getsize(path) / (1024 * 1024)
    os.remove(path)
    return size_mb
def enforce_head_constraint(num_heads, embed_dim):
    """ Adjusts number of heads to ensure divisibility with embedding dimension. """
    while embed_dim % num_heads != 0:
        num_heads -= 1
    return num_heads

def load_opt_classifier():
    model = OPTForSequenceClassification.from_pretrained("facebook/opt-350m", num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
    return model, tokenizer
def get_attention_scores(model, input_ids):
    """ Extracts attention scores from the model while ensuring correct dimensions. """
    attention_scores = {}
    input_ids = input_ids.to(device)  #  Move input IDs to GPU

    with torch.no_grad():
        outputs = model(input_ids, output_attentions=True)

        for layer_idx, attn in enumerate(outputs.attentions):
            attn = attn.cpu().numpy()  #  Move to CPU
            attn = np.mean(attn, axis=(0, 2, 3)) if attn.ndim == 4 else np.mean(attn, axis=0)
            attention_scores[layer_idx] = attn  #  Store correctly processed attention scores

    return attention_scores




def cluster_heads(attention_scores, num_clusters):
    """ Clusters attention heads while ensuring correct shape. """
    num_heads = len(attention_scores)

    if num_heads <= 10:
        return list(range(num_heads))

    attention_scores = np.array(attention_scores).reshape(-1, 1)  #  Flatten for clustering

    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init="auto")
    kmeans.fit(attention_scores)

    labels = kmeans.labels_
    cluster_representatives = []

    for cluster_idx in range(num_clusters):
        indices = np.where(labels == cluster_idx)[0]
        if len(indices) > 0:
            keep_count = max(1, len(indices) * 5 // 10)  #  Pruning 50% of heads per cluster
            cluster_representatives.extend(indices[:keep_count])

    return sorted(cluster_representatives)

def prune_attention_heads(model, clustered_heads):
    """ Prunes attention heads while ensuring correct embedding dimensions. """
    for layer_idx, heads_to_keep in enumerate(clustered_heads):
        attn_layer = model.model.decoder.layers[layer_idx].self_attn

        #  Ensure valid number of heads per layer
        original_num_heads = attn_layer.num_heads
        new_num_heads = enforce_head_constraint(len(heads_to_keep), attn_layer.embed_dim)

        #  Update number of heads
        attn_layer.num_heads = new_num_heads

        #  Ensure Q, K, V projections match new number of heads
        head_dim = attn_layer.embed_dim // original_num_heads
        new_embed_dim = new_num_heads * head_dim

        attn_layer.q_proj = nn.Linear(attn_layer.embed_dim, new_embed_dim, bias=False)
        attn_layer.k_proj = nn.Linear(attn_layer.embed_dim, new_embed_dim, bias=False)
        attn_layer.v_proj = nn.Linear(attn_layer.embed_dim, new_embed_dim, bias=False)

        #  Ensure output projection layer matches new size
        attn_layer.out_proj = nn.Linear(new_embed_dim, attn_layer.embed_dim, bias=False)

    return model

def divide_layers_by_sensitivity(sensitivities):
    """ Splits layers into 3 groups (High, Medium, Low) based on sensitivity scores. """
    sorted_layers = sorted(sensitivities, key=sensitivities.get, reverse=True)
    num_layers = len(sorted_layers)
    high = sorted_layers[: int(num_layers * 0.2)]
    medium = sorted_layers[int(num_layers * 0.2) : int(num_layers * 0.7)]
    low = sorted_layers[int(num_layers * 0.7) :]
    return high, medium, low

def apply_mixed_precision(model, medium, low):
    """ Applies mixed precision quantization to the model. """
    for layer_idx in medium + low:
        for param in model.model.decoder.layers[layer_idx].parameters():
            param.data = param.data.half()
    model.half()
    return model

def compute_sensitivity(attention_scores):
    # Debugging: Print attention scores
    print("Raw Attention Scores:", attention_scores)

    # Compute absolute mean for each layer
    cleaned_scores = {
        layer_idx: np.mean(np.abs(np.array(scores, dtype=np.float32)))
        for layer_idx, scores in attention_scores.items()
        if isinstance(scores, (list, np.ndarray)) and len(scores) > 0  # Ensure non-empty values
    }
    
    # Debugging: Print computed sensitivities
    print("Computed Sensitivities:", cleaned_scores)
    
    return cleaned_scores# -------------------- Evaluation -------------------- #
# def evaluate_model(model, tokenizer):
#     """ Evaluates model accuracy on PIQA dataset (multiple-choice task). """
#     dataset = load_dataset("piqa", split="validation[:100]")
#     model.eval()
#     correct, total = 0, 0

#     with torch.no_grad():
#         for example in dataset:
#             prompt = example["goal"]
#             choices = [example["sol1"], example["sol2"]]
#             inputs = tokenizer([prompt + " " + choice for choice in choices], return_tensors="pt", padding=True, truncation=True)
#             outputs = model(**inputs)
#             logits = outputs.logits.squeeze()
#             predicted_choice = torch.argmax(logits).item()
#             correct += (predicted_choice == example["label"])
#             total += 1

#     return (correct / total) * 100

# -------------------- Main Execution -------------------- #
def get_optimal_clusters(attention_scores):
    """ Determines optimal clusters for attention heads using the Elbow Method. """
    num_heads = len(attention_scores)
    if num_heads <= 10:
        return num_heads
    max_clusters = max(num_heads - int(0.2 * num_heads), num_heads * 4 // 5)
    errors = []
    cluster_range = range(1, max_clusters + 1)
    for num_clusters in cluster_range:
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init="auto")
        kmeans.fit(attention_scores.reshape(-1, 1))
        errors.append(kmeans.inertia_)
    elbow = KneeLocator(cluster_range, errors, curve="convex", direction="decreasing")
    return max(num_heads - int(0.2 * num_heads), elbow.elbow if elbow.elbow else max_clusters)

def get_model_size(model, path="temp_model.pth"):
    """ Saves model temporarily and checks disk size. """
    torch.save(model.state_dict(), path)
    size_mb = os.path.getsize(path) / (1024 * 1024)  # Convert bytes to MB
    os.remove(path)  #  Clean up after measurement
    return size_mb

def evaluate_model(model, tokenizer, dataset_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    """Evaluates model accuracy on the given dataset."""
    dataset_mapping = {
        "sst2": ("glue", "sst2", "sentence"),
        "piqa": ("piqa", "train", "goal"),
        "rte": ("glue", "rte", "sentence1"),
    }

    if dataset_name not in dataset_mapping:
        return {"error": f"Unsupported dataset: {dataset_name}"}

    dataset_source, dataset_subset, text_key = dataset_mapping[dataset_name]

    try:
        dataset = load_dataset(dataset_source, dataset_subset, split="train", trust_remote_code=True)
    except Exception as e:
        return {"error": f"Error loading dataset: {str(e)}"}

    model.eval()
    correct, total = 0, 0
    start_time = time.time()
    with torch.no_grad():
        for example in dataset:
            inputs = tokenizer(example[text_key], return_tensors="pt", padding=True, truncation=True).to(device)
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions.item() == example["label"])
            total += 1

    end_time = time.time()
    accuracy = (correct / total) * 100 if total > 0 else 0.0
    latency = end_time - start_time
    return accuracy, latency

def apply_pruning(model, tokenizer,dataset_name):
    size_before = get_model_size(model)

    print("\n Evaluating Accuracy Before Any Modification...")
    # accuracy_before = evaluate_model(model, tokenizer,dataset_name)
    # print(f" Original Accuracy: {accuracy_before:.2f}%\n")

    #  Measure Original Model Size
    #  Compute Attention Scores
    print("\n Computing Attention Scores...")
    input_ids = torch.randint(0, 50256, (1, 32))  # Random input for attention extraction
    attention_scores = get_attention_scores(model, input_ids)

    #  Apply Clustering to All Layers
    print("\n Clustering Attention Heads...")
#  Ensure layers exist before accessing
    if not attention_scores:
        raise ValueError(" No attention scores extracted! Check if model supports output_attentions.")

    available_layers = list(attention_scores.keys())
    print(f" Available Layers for Clustering: {available_layers}")

    clustered_heads = [
        cluster_heads(attention_scores[layer], get_optimal_clusters(attention_scores[layer]))
        for layer in available_layers
    ]


    #  Apply Clustering & Pruning (CHAI-Base)
    print("\n Applying Clustering and Pruning (CHAI-Base)...")
    chai_base_model = prune_attention_heads(model, clustered_heads)
    print("got heads")
    return chai_base_model

2025-02-15 21:30:39.295210: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-15 21:30:39.295231: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-15 21:30:39.296304: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-15 21:30:39.302379: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import torch
from transformers import AutoConfig, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import copy
import numpy as np

def compute_sensitivity(attention_scores):
    # Debugging: Print attention scores
    print("Raw Attention Scores:", attention_scores)

    # Compute absolute mean for each layer
    cleaned_scores = {
        layer_idx: np.mean(np.abs(np.array(scores, dtype=np.float32)))
        for layer_idx, scores in attention_scores.items()
        if isinstance(scores, (list, np.ndarray)) and len(scores) > 0  # Ensure non-empty values
    }
    
    # Debugging: Print computed sensitivities
    print("Computed Sensitivities:", cleaned_scores)
    
    return cleaned_scores
def get_attention_scores(model, input_ids):
    """Extracts attention scores from the model while ensuring correct dimensions."""
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids = input_ids.to(device)  #  Move input IDs to GPU
    model.to(device)
    
    attention_scores = {}

    with torch.no_grad():
        outputs = model(input_ids)

        if hasattr(outputs, "attentions") and outputs.attentions is not None:
            for layer_idx, attn in enumerate(outputs.attentions):
                attn = attn.cpu().numpy()  #  Move to CPU
                if attn.ndim == 4:  # Expected shape: (batch_size, num_heads, seq_length, seq_length)
                    attn = np.mean(attn, axis=(0, 2, 3))  #  Average across heads and sequences
                elif attn.ndim == 3:  # Unexpected case, still averaging
                    attn = np.mean(attn, axis=(0, 2))
                elif attn.ndim == 2:  # More unexpected cases
                    attn = np.mean(attn, axis=0)
                
                attention_scores[layer_idx] = attn  #  Store correctly processed attention scores

        else:
            return {"error": "Model does not return attention scores. Check model architecture."}

    return attention_scores
def main_chai_target(model, tokenizer, dataset_name, epochs=30, batch_size=16, learning_rate=2e-5):
    """
    Identifies and perturbs the most sensitive layers of the model based on accuracy drop.
    Returns the modified model with targeted layers fine-tuned.

    Args:
        model (torch.nn.Module): The pre-trained model to analyze.
        tokenizer: Tokenizer associated with the model.
        dataset_name (str): Name of the dataset to use ('rte', 'piqa', or 'sst2').
        epochs (int, optional): Number of fine-tuning epochs. Defaults to 3.
        batch_size (int, optional): Batch size for training. Defaults to 16.
        learning_rate (float, optional): Learning rate for fine-tuning. Defaults to 2e-5.

    Returns:
        torch.nn.Module: The model after targeted fine-tuning on sensitive layers.
    """
    print(" [chai-target] Before modification: model parameters")
    for name, param in model.named_parameters():
      print(f"  {name}: {param.shape}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
# If model is in half precision and running on CPU, convert to float32
    if device.type == "cpu":
        model = model.float()  #  Converts model weights to float32 for CPU compatibility

    # Mapping dataset names to Hugging Face's dataset format
    dataset_mapping = {
        "sst2": ("glue", "sst2"),
        "rte": ("glue", "rte"),
        "piqa": ("piqa", "validation"),
    }

    if dataset_name not in dataset_mapping:
        raise ValueError(f"Dataset '{dataset_name}' is not supported.")

    dataset_source, dataset_subset = dataset_mapping[dataset_name]
    dataset = load_dataset(dataset_source, dataset_subset, split="train")

    # Determine the appropriate text column
    text_column = 'sentence' if 'sentence' in dataset.column_names else 'goal'

    # Preprocessing function for tokenization
    def preprocess_function(examples):
        return tokenizer(examples[text_column], truncation=True, padding='max_length', max_length=512)  

    # Tokenize the dataset
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    train_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True)

    # Identify the most sensitive layers
    sensitivities = []

    # Placeholder: Manually selected sensitive layers
    input_ids = torch.randint(0, 50256, (1, 32))
    attention_scores = get_attention_scores(model, input_ids)
    sensitivities = compute_sensitivity(attention_scores)
# Compute the threshold for top 30% (Only if sensitivities are not empty)
    if sensitivities:
        num_layers = len(sensitivities)

        # Compute the threshold for top 30%
        top_k = int(0.3 * num_layers)

        # Get the indices of the top 30% most sensitive layers
        top_layer_indices = torch.argsort(sensitivities, descending=True)[:top_k]
        targeted_layers= top_layer_indices.tolist()
        print(f"Identified Targeted Layers: {targeted_layers}")
    else:
        print(" Warning: No sensitivities detected. Skipping targeted fine-tuning.")
        targeted_layers = [2,3,4]  #  Skip fine-tuning if no layers are detected

    print(f"Identified Targeted Layers: {targeted_layers}")


    # 🔹 **Fine-Tune Only Targeted Layers**
    print("Starting fine-tuning on targeted layers...")

    # Freeze all layers except the targeted layers
    for layer_num, layer in enumerate(model.model.decoder.layers):
        for param in layer.parameters():
            param.requires_grad = layer_num in targeted_layers  # Only fine-tune targeted layers

        # Define training arguments
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch" if dataset_name != "piqa" else "no",  #  Fix for missing eval dataset
        save_strategy="epoch",
        learning_rate=learning_rate,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset,  #  Ensuring evaluation dataset is provided
    )

    # Fine-tune the model
    trainer.train()



    # Fine-tune the model
    trainer.train()

    # Ensure model is correctly returned
    if hasattr(trainer.model, "module"):
        return trainer.model.module  # Extract if wrapped in DataParallel
    for name, param in model.named_parameters():
        print(f"  {name}: {param.shape}")

    return trainer.model  # Returning the fine-tuned model


In [3]:
dataset_name = "sst2"
model_name = "facebook/opt-350m"
model = OPTForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = main_chai_target(model, tokenizer, dataset_name)
def evaluate_model(model, tokenizer, dataset_name):
    """Evaluates model accuracy on the given dataset."""

    #  Dataset mappings for correct loading
    dataset_mapping = {
        "sst2": ("glue", "sst2", "validation", "sentence"),
        "rte": ("glue", "rte", "validation", ("sentence1", "sentence2")),  # RTE has two sentences
        "piqa": ("piqa", None, "validation", ("goal", "sol1", "sol2")),  # PIQA has different format
    }

    if dataset_name not in dataset_mapping:
        return {"error": f"Unsupported dataset: {dataset_name}"}

    dataset_source, dataset_subset, split_name, text_key = dataset_mapping[dataset_name]

    #  Load dataset with cache handling
    try:
        dataset = load_dataset(dataset_source, dataset_subset, split=split_name, cache_dir="./cache") if dataset_subset \
            else load_dataset(dataset_source, split=split_name, cache_dir="./cache")
    except Exception as e:
        return {"error": f"Error loading dataset: {str(e)}"}

    dataloader = DataLoader(dataset, batch_size=16)

    #  Enable GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    #  Ensure correct precision (avoid float16 on CPU)
    if device.type == "cpu":
        model.to(torch.float32)

    start_time = time.time()
    correct, total = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            #  Handle multiple input fields correctly
            if isinstance(text_key, tuple):  # RTE & PIQA
                inputs = tokenizer(*[batch[key] for key in text_key], return_tensors="pt", padding=True, truncation=True).to(device)
            else:
                inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True).to(device)

            #  FIX: Correct tensor creation and movement
            labels = torch.tensor(batch["label"], dtype=torch.long).to(device, non_blocking=True)

            outputs = model(**inputs)
            predictions = torch.argmax(F.softmax(outputs.logits, dim=-1), dim=-1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    end_time = time.time()
    accuracy = (correct / total) * 100 if total > 0 else 0.0
    latency = end_time - start_time

    return [accuracy, latency]

dataset_name = "piqa"
model_name = "facebook/opt-350m"
model = OPTForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    
tokenizer = AutoTokenizer.from_pretrained(model_name)
[a1, b1] = evaluate_model_piqa(model, tokenizer,dataset, num_classes=2)


#  Apply Pruning
model = apply_pruning(model, tokenizer, dataset_name)

print("\n Initial Model Evaluation:")
print(f"Accuracy: {a1:.2f}%")
print(f"Latency: {b1:.4f} sec")
print("------------------------------------------------------------------")

#  Measure Size After Pruning
print("\n Chai Base Model Evaluation (After Pruning)")
size2 = get_model_size(model)
print(f"Reduction Percentage: {(size1 - size2) * 100 / size1:.2f}%")

[a, b] = evaluate_model_piqa(model, tokenizer,dataset, num_classes=2)
print(f"Accuracy: {a:.2f}%")
print(f"Latency: {b:.4f} sec")
print("------------------------------------------------------------------")

#  Apply CHAI-Quant Enhancement
model = main_chai_target(model, tokenizer)
size3 = get_model_size(model)
print("\n Applied Methods: TARGETTED FINE TUNING")
[a, b] = evaluate_model_piqa(model, tokenizer,dataset, num_classes=2)

print("\n Final Model Evaluation (After CHAI-TARGET)")
print(f"Accuracy after applying methods: {a:.2f}%")
print(f"Latency: {b:.4f} sec")
print(f"Reduction Percentage: {(size2 - size3) * 100 / size2:.2f}%")

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔹 [chai-target] Before modification: model parameters
  model.decoder.embed_tokens.weight: torch.Size([50272, 512])
  model.decoder.embed_positions.weight: torch.Size([2050, 1024])
  model.decoder.project_out.weight: torch.Size([512, 1024])
  model.decoder.project_in.weight: torch.Size([1024, 512])
  model.decoder.layers.0.self_attn.k_proj.weight: torch.Size([1024, 1024])
  model.decoder.layers.0.self_attn.k_proj.bias: torch.Size([1024])
  model.decoder.layers.0.self_attn.v_proj.weight: torch.Size([1024, 1024])
  model.decoder.layers.0.self_attn.v_proj.bias: torch.Size([1024])
  model.decoder.layers.0.self_attn.q_proj.weight: torch.Size([1024, 1024])
  model.decoder.layers.0.self_attn.q_proj.bias: torch.Size([1024])
  model.decoder.layers.0.self_attn.out_proj.weight: torch.Size([1024, 1024])
  model.decoder.layers.0.self_attn.out_proj.bias: torch.Size([1024])
  model.decoder.layers.0.self_attn_layer_norm.weight: torch.Size([1024])
  model.decoder.layers.0.self_attn_layer_norm.bias: tor

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Raw Attention Scores: {'error': 'Model does not return attention scores. Check model architecture.'}
Computed Sensitivities: {}
Identified Targeted Layers: [2, 3, 4]
Starting fine-tuning on targeted layers...




Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 68.69 MiB is free. Process 3532101 has 1.57 GiB memory in use. Process 3532317 has 22.03 GiB memory in use. Of the allocated memory 21.50 GiB is allocated by PyTorch, and 288.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)