In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time


# Directory used for cached artifacts (Colab default; change when running elsewhere)
cache_dir = "/content/cache"

# Fix the NumPy and PyTorch RNG seeds so repeated runs draw the same random numbers
np.random.seed(42)
torch.manual_seed(42)

# Prefer CUDA when PyTorch can see a GPU, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# %pip installs into the environment of the running kernel; the version is pinned to
# the one recorded in this notebook's install log so re-runs resolve the same package.
%pip install pyRAPL==0.2.3.1
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.profiler import profile, record_function, ProfilerActivity
import cProfile
import pyRAPL
import subprocess

# Experiment settings — everything a reader might want to tune lives here
MODEL_NAME = "gpt2"    # Hugging Face checkpoint identifier
BATCH_SIZE = 8         # number of dataset rows used for calibration and evaluation
MAX_LENGTH = 512       # every sequence is padded / truncated to this many tokens
PRUNE_RATIO = 0.5      # fraction of weights to remove per pruned layer

# Target device as a plain string: GPU when available, CPU otherwise
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Tokenizer: pad with the EOS token so fixed-length padding is possible
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Causal LM: load the checkpoint, move it to the target device, switch to eval mode
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
model.eval()

# Evaluation text: the test split of the "wikitext-2-v1" configuration
dataset = load_dataset("wikitext", "wikitext-2-v1", split="test")

# Tokenize
def tokenize(example):
    """Tokenize the 'text' field of a dataset batch to a fixed length.

    Args:
        example: mapping with a 'text' entry (the batch handed over by `dataset.map`).

    Returns:
        Whatever the module-level `tokenizer` returns for that text when padding
        and truncating to MAX_LENGTH tokens.
    """
    texts = example['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )
    return encoded

# Tokenize the whole split in batches; `tokenized` keeps every row
tokenized = dataset.map(tokenize, batched=True)

# Slice the rows first, then pick the columns: only BATCH_SIZE examples are turned
# into Python lists, instead of indexing the full token columns and discarding all
# but the first BATCH_SIZE entries.
calibration_rows = tokenized[:BATCH_SIZE]
input_ids = torch.tensor(calibration_rows["input_ids"]).to(DEVICE)
attention_mask = torch.tensor(calibration_rows["attention_mask"]).to(DEVICE)

# --- Calibration: record per-layer input statistics for the pruning score ---
# GPT-2 in Hugging Face Transformers implements its attention/MLP projections with
# Conv1D rather than nn.Linear, so both layer types must be hooked and pruned;
# matching nn.Linear alone only ever reaches the output head.
from transformers.pytorch_utils import Conv1D

PRUNABLE_LAYERS = (nn.Linear, Conv1D)

# Maps module name -> 1-D tensor of mean |input| per input feature
activation_store = {}

def get_activation(name):
    """Build a forward hook that records input statistics for the layer `name`.

    The hook reduces the layer's first positional input to the mean absolute value
    of each input feature and stores that vector in `activation_store[name]`.
    Keeping only the reduced statistic avoids holding full activations in memory.
    """
    def hook(module, input, output):
        x = input[0].detach()
        # Ignore padding positions when the input lines up with the calibration
        # batch, so pad tokens do not dominate the statistics.
        if x.dim() == 3 and x.shape[:2] == attention_mask.shape:
            valid = attention_mask.bool()
            if valid.any():
                x = x[valid]
        activation_store[name] = x.reshape(-1, x.shape[-1]).abs().mean(dim=0)
    return hook

hook_handles = [
    module.register_forward_hook(get_activation(name))
    for name, module in model.named_modules()
    if isinstance(module, PRUNABLE_LAYERS)
]

# Run one forward pass to capture activations, then remove the hooks so that the
# later evaluation and profiling runs are not slowed down by them.
try:
    with torch.no_grad():
        _ = model(input_ids=input_ids, attention_mask=attention_mask)
finally:
    for handle in hook_handles:
        handle.remove()

# Wanda pruning logic
def wanda_prune(model, prune_ratio):
    """Zero the lowest-scoring weights of every calibrated layer, in place.

    Each weight is scored as |W| times the mean absolute input activation of its
    input feature (taken from `activation_store`). Within each layer the
    int(prune_ratio * numel) lowest-scoring weights are set to zero.

    NOTE(review): the published Wanda metric uses the L2 norm of each input feature
    and ranks weights per output neuron; this notebook's variant (mean |x|, one
    threshold per layer) is kept as-is — confirm that is intended.

    Args:
        model: module whose nn.Linear / Conv1D sub-layers are pruned.
        prune_ratio: fraction of weights to remove per layer, in [0, 1].

    Raises:
        ValueError: if prune_ratio is outside [0, 1].
    """
    if not 0.0 <= prune_ratio <= 1.0:
        raise ValueError(f"prune_ratio must be in [0, 1], got {prune_ratio}")

    # A layer that shares its weight with the input embedding (GPT-2's tied lm_head)
    # is skipped: pruning it would silently zero entries of the token embeddings.
    try:
        tied_weight = model.get_input_embeddings().weight
    except (AttributeError, NotImplementedError):
        tied_weight = None

    for name, module in model.named_modules():
        if not isinstance(module, PRUNABLE_LAYERS) or name not in activation_store:
            continue
        if tied_weight is not None and module.weight is tied_weight:
            continue

        W = module.weight.data
        A = activation_store[name]
        # Accepts either the reduced per-feature vector or raw (..., in_features) inputs
        avg_A = A.reshape(-1, A.shape[-1]).abs().mean(dim=0).to(device=W.device, dtype=W.dtype)

        # nn.Linear stores weight as (out, in); Conv1D stores it as (in, out)
        if isinstance(module, Conv1D):
            scores = W.abs() * avg_A.unsqueeze(1)
        else:
            scores = W.abs() * avg_A.unsqueeze(0)

        num_pruned = int(prune_ratio * scores.numel())
        if num_pruned == 0:
            continue

        # Select exactly num_pruned lowest scores; an index-based mask keeps the
        # sparsity exact even when many scores tie.
        _, pruned_idx = torch.topk(scores.reshape(-1), num_pruned, largest=False, sorted=False)
        keep = torch.ones(scores.numel(), dtype=torch.bool, device=W.device)
        keep[pruned_idx] = False
        module.weight.data.mul_(keep.view_as(scores))  # Zero out pruned weights

start_time = time.time()
wanda_prune(model, PRUNE_RATIO)

# Evaluate perplexity
def evaluate_perplexity(model, input_ids, attention_mask):
    """Return the perplexity of `model` on one batch, ignoring padded positions.

    Labels are a copy of `input_ids` in which every position with
    attention_mask == 0 is replaced by -100, the label value that Hugging Face
    causal-LM losses skip. Without this, padding tokens are scored as if they were
    real targets. The mask is used rather than the pad token id because the pad
    token here is the EOS token.

    Args:
        model: callable accepting `input_ids`, `attention_mask` and `labels`
            keyword arguments and returning an object with a `.loss` tensor.
        input_ids: integer tensor of token ids.
        attention_mask: tensor of the same shape (non-zero = real token), or None
            to score every position.

    Returns:
        float: exp(loss) for the batch.
    """
    if attention_mask is None:
        labels = input_ids
    else:
        # masked_fill is out-of-place, so input_ids itself is left untouched
        labels = input_ids.masked_fill(attention_mask == 0, -100)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return torch.exp(outputs.loss).item()

# Score the model on the same batch and report the result
perplexity = evaluate_perplexity(
    model,
    input_ids,
    attention_mask,
)
print(f"Perplexity after pruning: {perplexity:.2f}")

# Profile CPU usage
def profile_cpu():
    """Profile one perplexity evaluation with cProfile and print the statistics.

    Runs `evaluate_perplexity` on the module-level `model`, `input_ids` and
    `attention_mask`, then prints the collected statistics sorted by cumulative
    time. Any exception raised by the evaluation propagates to the caller.
    """
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        evaluate_perplexity(model, input_ids, attention_mask)
    finally:
        # Always switch the profiler off; otherwise a failed evaluation would leave
        # it active for the rest of the kernel session.
        profiler.disable()
    profiler.print_stats(sort="cumtime")

# Run the cProfile pass once; its statistics are printed to the cell output
profile_cpu()



def estimate_cpu_energy(runtime_seconds, tdp_watts=65):
    """Estimate energy use in watt-hours for a run at constant TDP power.

    Args:
        runtime_seconds: duration of the run in seconds.
        tdp_watts: the CPU's TDP, i.e. its max power in watts (e.g., 65W for Intel i5).

    Returns:
        tdp_watts * runtime_seconds converted from watt-seconds to watt-hours.
    """
    watt_seconds = tdp_watts * runtime_seconds
    return watt_seconds / 3600  # 3600 seconds per hour

# start_time was captured just before wanda_prune ran, so this interval spans
# pruning, the perplexity evaluation and the cProfile pass — not inference alone.
runtime = time.time() - start_time
print(f"Estimated CPU Energy: {estimate_cpu_energy(runtime):.2f} Wh")

def get_gpu_energy():
    """Return the current GPU power draw in watts as reported by nvidia-smi.

    Despite the name, the value is an instantaneous power reading (W), not an
    energy total. nvidia-smi prints one line per GPU; the readings are summed so
    the result is the total draw across all GPUs (identical to the single reading
    on a one-GPU machine).

    Returns:
        float: total power draw in watts, or NaN when nvidia-smi is missing, exits
        with an error, or reports no numeric reading (e.g. on a CPU-only runtime).
    """
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        return float("nan")

    readings = []
    for line in result.stdout.splitlines():
        try:
            readings.append(float(line.strip()))
        except ValueError:
            # Blank or non-numeric field (the GPU did not report a power value)
            continue
    return sum(readings) if readings else float("nan")

# Report the value returned by get_gpu_energy() (a power reading in watts)
print(f"GPU Power Draw: {get_gpu_energy()} W")

# Profile inference (time & memory).
# CUDA activity is requested only when a GPU is present, and the table is sorted by
# the matching time column, so the report stays meaningful on a CPU-only runtime.
cuda_available = torch.cuda.is_available()
profiler_activities = [ProfilerActivity.CPU]
if cuda_available:
    profiler_activities.append(ProfilerActivity.CUDA)

# profile_memory=True adds the memory columns that the heading above promises
with profile(activities=profiler_activities, record_shapes=True, profile_memory=True) as prof:
    with record_function("model_inference"):
        evaluate_perplexity(model, input_ids, attention_mask)
print(prof.key_averages().table(
    sort_by="cuda_time_total" if cuda_available else "cpu_time_total",
    row_limit=10,
))

# Track energy (CPU) with pyRAPL
# NOTE(review): pyRAPL presumably needs readable Intel RAPL counters on the host;
# setup() may fail on runtimes that do not expose them — confirm on the target machine.
pyRAPL.setup()
# The decorator is applied after setup(); presumably it measures each call of the
# wrapped function — verify against the pyRAPL documentation.
@pyRAPL.measureit
def energy_profile():
    """Run one perplexity evaluation on the module-level model and batch."""
    return evaluate_perplexity(model, input_ids, attention_mask)
energy_profile()

# Track GPU energy (Watt draw)
def gpu_energy():
    """Print the GPU power draw (in watts) reported by nvidia-smi.

    The raw nvidia-smi output is printed as-is (one value per GPU). When nvidia-smi
    is not installed or exits with an error — e.g. on a CPU-only runtime — a notice
    is printed instead of raising, so the rest of the notebook keeps running.
    """
    try:
        raw = subprocess.check_output([
            "nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"
        ])
    except (FileNotFoundError, subprocess.CalledProcessError) as exc:
        print(f"GPU Power Draw (Watts): unavailable ({exc})")
        return
    print(f"GPU Power Draw (Watts): {raw.decode().strip()}")

# Print the GPU power reading once more, after the profiling runs above
gpu_energy()


Collecting pyRAPL
  Downloading pyRAPL-0.2.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading pyRAPL-0.2.3.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: pyRAPL
Successfully installed pyRAPL-0.2.3.1
