In [18]:
from flask import Flask, request, jsonify, render_template
import torch
import torch.nn.functional as F
import numpy as np
import os
from transformers import OPTForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.cluster import KMeans
from kneed import KneeLocator
import time
from flask import Flask, request, jsonify, render_template
import torch
import torch.nn.functional as F
import numpy as np
import os
from transformers import AutoConfig, AutoModelForSequenceClassification, OPTForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.cluster import KMeans
from kneed import KneeLocator
import time
from chai_quant import chai_quant_enhancement
from chai_kd import chai_knowledge_distillation_enhancement
from chai_target import main_chai_target
from original_chai import apply_pruning
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import torch
from tqdm import tqdm
import tempfile
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_model_size(model):
    """ Temporarily saves the model and measures its file size in MB. """
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_path = temp_file.name
    
    torch.save(model.state_dict(), temp_path)
    size_mb = os.path.getsize(temp_path) / (1024 * 1024)  # Convert bytes to MB
    os.remove(temp_path)  #  Clean up temporary file
    return size_mb
def get_attention_scores(model, input_ids):
    """Extracts attention scores from the model while ensuring correct dimensions."""
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids = input_ids.to(device)  #  Move input IDs to GPU
    model.to(device)
    
    attention_scores = {}

    with torch.no_grad():
        outputs = model(input_ids)

        if hasattr(outputs, "attentions") and outputs.attentions is not None:
            for layer_idx, attn in enumerate(outputs.attentions):
                attn = attn.cpu().numpy()  # Move to CPU
                if attn.ndim == 4:  # Expected shape: (batch_size, num_heads, seq_length, seq_length)
                    attn = np.mean(attn, axis=(0, 2, 3))  #  Average across heads and sequences
                elif attn.ndim == 3:  # Unexpected case, still averaging
                    attn = np.mean(attn, axis=(0, 2))
                elif attn.ndim == 2:  # More unexpected cases
                    attn = np.mean(attn, axis=0)
                
                attention_scores[layer_idx] = attn  #  Store correctly processed attention scores

        else:
            return {"error": "Model does not return attention scores. Check model architecture."}

    return attention_scores

def divide_layers_by_sensitivity(sensitivities):
    """ Splits layers into 3 groups (High, Medium, Low) based on sensitivity scores. """
    sorted_layers = sorted(sensitivities, key=sensitivities.get, reverse=True)
    num_layers = len(sorted_layers)
    high = sorted_layers[: int(num_layers * 0.2)]
    medium = sorted_layers[int(num_layers * 0.2) : int(num_layers * 0.7)]
    low = sorted_layers[int(num_layers * 0.7) :]
    return high, medium, low

def apply_mixed_precision(model, medium, low):
    """ Applies mixed precision quantization to the model. """
    for layer_idx in medium + low:
        for param in model.model.decoder.layers[layer_idx].parameters():
            param.data = param.data.half()
    model.half()
    return model
def load_opt_classifier(model_name):
    """Load the specified OPT model and tokenizer."""
    model = OPTForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer
def compute_sensitivity(attention_scores):
    cleaned_scores = {
        layer_idx: np.mean(np.abs(np.array(scores, dtype=np.float32)))
        for layer_idx, scores in attention_scores.items()
        if isinstance(scores, (list, np.ndarray))
    }
    return cleaned_scores
def evaluate_model_piqa(model, tokenizer):
    """ Evaluates model accuracy on PIQA dataset (multiple-choice task). """
    dataset = load_dataset("piqa", split="validation[:100]")

    model.eval()  #  Remove `.to(device)`
    
    correct, total = 0, 0
    start_time = time.time()
    with torch.no_grad():
        for example in dataset:
            prompt = example["goal"]
            choices = [example["sol1"], example["sol2"]]

            inputs = tokenizer(
                [prompt + " " + choice for choice in choices],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(device)  #  Ensure inputs are on the correct device

            #  No need to move the model to device again
            outputs = model(**inputs, use_cache=False)
            logits = outputs.logits.squeeze()
            predicted_choice = torch.argmax(logits, dim=-1)

            label = torch.tensor(example["label"], device=device)
            correct += (predicted_choice == label).sum().item()
            total += 1

    end_time = time.time()
    latency = end_time - start_time
    return (correct / total) * 100, latency



# -------------------- Evaluation -------------------- #
def evaluate_model_piqa1(model, tokenizer):
    """Evaluates model accuracy on the given dataset."""
    from datasets import load_dataset
    dataset = load_dataset("piqa", split="validation")
    #  Dataset mappings for correct loading
    # dataset_mapping = {
    #     "sst2": ("glue", "sst2", "validation", "sentence"),
    #     "rte": ("glue", "rte", "validation", ("sentence1", "sentence2")),
    #     "piqa": ("piqa", None, "validation", ("goal", "sol1", "sol2")),  # PIQA has different format
    # }

    # if dataset_name not in dataset_mapping:
    #     return {"error": f"Unsupported dataset: {dataset_name}"}

    # dataset_source, dataset_subset, split_name, text_key = dataset_mapping[dataset_name]

    # #  Load dataset with cache handling
    # try:
    #     dataset = load_dataset(dataset_source, dataset_subset, split=split_name, cache_dir="./cache") if dataset_subset \
    #         else load_dataset(dataset_source, split=split_name, cache_dir="./cache")
    # except Exception as e:
    #     return {"error": f"Error loading dataset: {str(e)}"}

    dataloader = DataLoader(dataset, batch_size=16)

    #  Enable GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    #  Ensure correct precision (avoid float16 on CPU)
    if device.type == "cpu":
        model.to(torch.float32)

    start_time = time.time()
    correct, total = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            if dataset_name == "piqa":
                #  Correctly tokenize both solutions separately
                inputs1 = tokenizer(batch["goal"], batch["sol1"], return_tensors="pt", padding=True, truncation=True).to(device)
                inputs2 = tokenizer(batch["goal"], batch["sol2"], return_tensors="pt", padding=True, truncation=True).to(device)

                #  Get logits separately for each solution
                outputs1 = model(**inputs1, use_cache=False).logits
                outputs2 = model(**inputs2, use_cache=False).logits

                #  Predict the solution with higher confidence
                predictions = torch.argmax(torch.stack([outputs1, outputs2], dim=-1), dim=-1).squeeze()
            else:
                #  Handle other datasets (SST2, RTE) normally
                if isinstance(text_key, tuple):
                    inputs = tokenizer(*[batch[key] for key in text_key], return_tensors="pt", padding=True, truncation=True).to(device)
                else:
                    inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True).to(device)

                outputs = model(**inputs)
                predictions = torch.argmax(F.softmax(outputs.logits, dim=-1), dim=-1)

            labels = torch.tensor(batch["label"], dtype=torch.long).to(device, non_blocking=True)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    end_time = time.time()
    accuracy = (correct / total) * 100 if total > 0 else 0.0
    latency = end_time - start_time

    print(f"Accuracy: {accuracy:.2f}%, Latency: {latency:.4f} sec")
    return (accuracy, latency)

def evaluate_model(model, tokenizer, dataset_name):
    """Evaluates model accuracy on the given dataset."""

    #  Dataset mappings for correct loading
    dataset_mapping = {
        "sst2": ("glue", "sst2", "validation", "sentence"),
        "rte": ("glue", "rte", "validation", ("sentence1", "sentence2")),  # RTE has two sentences
        "piqa": ("piqa", None, "validation", ("goal", "sol1", "sol2")),  # PIQA has different format
    }

    if dataset_name not in dataset_mapping:
        return {"error": f"Unsupported dataset: {dataset_name}"}

    dataset_source, dataset_subset, split_name, text_key = dataset_mapping[dataset_name]

    # Load dataset with cache handling
    try:
        dataset = load_dataset(dataset_source, dataset_subset, split=split_name, cache_dir="./cache") if dataset_subset \
            else load_dataset(dataset_source, split=split_name, cache_dir="./cache")
    except Exception as e:
        return {"error": f"Error loading dataset: {str(e)}"}

    dataloader = DataLoader(dataset, batch_size=16)

    #  Enable GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    #  Ensure correct precision (avoid float16 on CPU)
    if device.type == "cpu":
        model.to(torch.float32)

    start_time = time.time()
    correct, total = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            #  Handle multiple input fields correctly
            if isinstance(text_key, tuple):  # RTE & PIQA
                inputs = tokenizer(*[batch[key] for key in text_key], return_tensors="pt", padding=True, truncation=True).to(device)
            else:
                inputs = tokenizer(batch[text_key], return_tensors="pt", padding=True, truncation=True).to(device)

            #  FIX: Correct tensor creation and movement
            labels = torch.tensor(batch["label"], dtype=torch.long).to(device, non_blocking=True)

            outputs = model(**inputs)
            predictions = torch.argmax(F.softmax(outputs.logits, dim=-1), dim=-1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    end_time = time.time()
    accuracy = (correct / total) * 100 if total > 0 else 0.0
    latency = end_time - start_time
    print([accuracy, latency])
    print("[accuracy, latency]")
    a = accuracy
    b = latency
    return (a, b)

def chai_quant_enhancement(chai_base_model,tokenizer,dataset_name):
    #  Save & Reload to Apply Pruning
    #  Measure Model Size After Pruning
    input_ids = torch.randint(0, 50256, (1, 32))
    attention_scores = get_attention_scores(chai_base_model, input_ids)
    sensitivities = compute_sensitivity(attention_scores)
    high, medium, low = divide_layers_by_sensitivity(sensitivities)

    #  Apply Mixed Precision Quantization (CHAI-Quant)
    print("\n Applying Mixed Precision Quantization (CHAI-Quant)...")
    chai_quant_model = apply_mixed_precision(chai_base_model, medium, low)
    print(" [chai-quant] After modification: model parameters")
    for name, param in chai_quant_model.named_parameters():
        print(f"  {name}: {param.shape}")

    return chai_quant_model
# dataset_name = "piqa"
# model_name = "facebook/opt-2.7b"
# model = OPTForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from huggingface_hub import login

#  Log in to Hugging Face (ensure your token has "read" access)
HUGGINGFACE_TOKEN = "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"  # Replace with your actual token
login(HUGGINGFACE_TOKEN)

dataset_name = "piqa"
model_name = "meta-llama/Llama-2-7b-hf"  #  Use LLaMA-2 model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

#  Define the LLaMA 2 model name
model_name = "meta-llama/Llama-2-7b-hf"

#  Load model with automatic device placement
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    device_map="auto",  #  Automatically assigns model to available GPU(s) & CPU
    torch_dtype=torch.float16,  #  Use half-precision for efficiency
    trust_remote_code=True  #  Needed for some HF models
)

#  Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print(" LLaMA 2 Model Loaded Successfully")

#  Load tokenizer with padding setup
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  #  Set EOS token as pad token
tokenizer.padding_side = "left"  #  Important for LLaMA models

print(" LLaMA 2 Model Loaded Successfully")

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model, tokenizer = load_opt_classifier(model_name)
size1 = get_model_size(model)
(a1,b1)= evaluate_model_piqa(model, tokenizer, dataset_name)

model = apply_pruning(model,tokenizer,dataset_name)

print(a1)
print(b1)
print("------------------------------------------------------------------")

print("chai base ")
size2 = get_model_size(model)
print("reduction percentage")
print((size1-size2)*100/size1)
(a,b)= evaluate_model_piqa(model, tokenizer, dataset_name)
print(a)
print(b)
print("------------------------------------------------------------------")
model = chai_quant_enhancement(model, tokenizer, dataset_name)
size3 = get_model_size(model)

print("applied_methods.append(Quantization)")
(a,b)= evaluate_model_piqa(model, tokenizer, dataset_name)
print("Accuracy after applying methods")
print(a)
print("latency")
print(b)
print("reduction percentage")
print((size2-size3)*100/size2)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /tmp/xdg-cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some parameters are on the meta device because they were offloaded to the cpu.


✅ LLaMA 2 Model Loaded Successfully
✅ LLaMA 2 Model Loaded Successfully


ValueError: Cannot handle batch sizes > 1 if no padding token is defined.

In [19]:
a = 2
print("Sad")

Sad
