In [1]:
!pip install -q pyRAPL transformers datasets accelerate

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.profiler import profile, record_function, ProfilerActivity
import cProfile
import pyRAPL
import subprocess
import time 

# Configuration
MODEL_NAME = "gpt2"
BATCH_SIZE = 8
MAX_LENGTH = 512
PRUNE_RATIO = 0.5
DEVICE = "cpu"  # Force CPU

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
model = model.half()  # Convert model weights to float16

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model.eval()

# Load dataset
dataset = load_dataset("wikitext", "wikitext-2-v1", split="test")

# Tokenize
def tokenize(example):
    return tokenizer(example['text'], padding="max_length", truncation=True, max_length=MAX_LENGTH)

tokenized = dataset.map(tokenize, batched=True)
input_ids = torch.tensor(tokenized["input_ids"][:BATCH_SIZE]).to(DEVICE).half()
attention_mask = torch.tensor(tokenized["attention_mask"][:BATCH_SIZE]).to(DEVICE).half()

# Forward hook to capture activations
activation_store = {}

def get_activation(name):
    def hook(module, input, output):
        activation_store[name] = input[0].detach()
    return hook

for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        module.register_forward_hook(get_activation(name))

# Run one forward pass to capture activations
with torch.no_grad():
    _ = model(input_ids=input_ids, attention_mask=attention_mask)

# Wanda pruning logic
def wanda_prune(model, prune_ratio):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and name in activation_store:
            W = module.weight.data
            A = activation_store[name]  # shape: (batch, in_features)
            A = A.view(-1, A.shape[-1])  # Flatten batch and seq dims → (B*T, hidden_dim)
            avg_A = torch.abs(A).mean(dim=0)  # Shape: (in_features,)
            scores = torch.abs(W) * avg_A.unsqueeze(0)  # broadcasting

            # Flatten and find the global threshold
            num_params = scores.numel()
            k = int((1 - prune_ratio) * num_params)
            topk_scores, _ = torch.topk(scores.view(-1), k, largest=True, sorted=False)
            threshold = topk_scores.min()
            mask = scores >= threshold

            module.weight.data *= mask  # Zero out pruned weights

start_time = time.time()
wanda_prune(model, PRUNE_RATIO)

# Evaluate perplexity
def evaluate_perplexity(model, input_ids, attention_mask):
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
    return torch.exp(loss.float()).item()  # cast loss to float32 for exp()

perplexity = evaluate_perplexity(model, input_ids, attention_mask)
print(f"Perplexity after pruning: {perplexity:.2f}")

# Profile CPU usage
def profile_cpu():
    profiler = cProfile.Profile()
    profiler.enable()
    evaluate_perplexity(model, input_ids, attention_mask)
    profiler.disable()
    profiler.print_stats(sort="cumtime")

profile_cpu()

# Estimate CPU Energy
def estimate_cpu_energy(runtime_seconds, tdp_watts=65):
    return tdp_watts * runtime_seconds / 3600  # Energy in Wh

runtime = time.time() - start_time
print(f"Estimated CPU Energy: {estimate_cpu_energy(runtime):.2f} Wh")

# GPU profiling will not work since we’re on CPU
# Skip GPU-specific profiling and power draw

# Track energy (CPU) with pyRAPL
try:
    pyRAPL.setup()
    @pyRAPL.measureit
    def energy_profile():
        return evaluate_perplexity(model, input_ids, attention_mask)
    energy_profile()
except Exception as e:
    print(f"pyRAPL failed: {e}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

2025-04-10 10:04:13.928771: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744279454.150477      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744279454.218570      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/685k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.07M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/618k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.HalfTensor instead (while checking arguments for embedding)