In [1]:
!pip install pyRAPL transformers datasets accelerate

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.profiler import profile, record_function, ProfilerActivity
import cProfile
import pyRAPL
import subprocess
import time

# Configuration
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
BATCH_SIZE = 4
MAX_LENGTH = 512
PRUNE_RATIO = 0.5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    device_map="auto" if DEVICE == "cuda" else None
)
model.eval()

# Load dataset
dataset = load_dataset("wikitext", "wikitext-2-v1", split="test")

# Tokenize
def tokenize(example):
    return tokenizer(example['text'], padding="max_length", truncation=True, max_length=MAX_LENGTH)

tokenized = dataset.map(tokenize, batched=True)
input_ids = torch.tensor(tokenized["input_ids"][:BATCH_SIZE]).to(DEVICE)
attention_mask = torch.tensor(tokenized["attention_mask"][:BATCH_SIZE]).to(DEVICE)

# Hook to collect activations
activation_store = {}
def get_activation(name):
    def hook(module, input, output):
        activation_store[name] = input[0].detach()
    return hook

for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        module.register_forward_hook(get_activation(name))

# Initial forward pass
with torch.no_grad():
    _ = model(input_ids=input_ids, attention_mask=attention_mask)

# Wanda-style pruning
def wanda_prune(model, prune_ratio):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and name in activation_store:
            W = module.weight.data
            A = activation_store[name]
            A = A.view(-1, A.shape[-1])
            avg_A = torch.abs(A).mean(dim=0)
            scores = torch.abs(W) * avg_A.unsqueeze(0)
            k = int((1 - prune_ratio) * scores.numel())
            topk_scores, _ = torch.topk(scores.view(-1), k, largest=True, sorted=False)
            threshold = topk_scores.min()
            mask = scores >= threshold
            module.weight.data *= mask

start_time = time.time()
wanda_prune(model, PRUNE_RATIO)

# Evaluate perplexity
def evaluate_perplexity(model, input_ids, attention_mask):
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
    return torch.exp(loss).item()

perplexity = evaluate_perplexity(model, input_ids, attention_mask)
print(f"Perplexity after pruning: {perplexity:.2f}")

# CPU profile
def profile_cpu():
    profiler = cProfile.Profile()
    profiler.enable()
    evaluate_perplexity(model, input_ids, attention_mask)
    profiler.disable()
    profiler.print_stats(sort="cumtime")

profile_cpu()

# CPU energy estimation
def estimate_cpu_energy(runtime_seconds, tdp_watts=65):
    return tdp_watts * runtime_seconds / 3600

runtime = time.time() - start_time
print(f"Estimated CPU Energy: {estimate_cpu_energy(runtime):.2f} Wh")

# GPU energy via nvidia-smi
def get_gpu_energy():
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"],
        capture_output=True, text=True
    )
    lines = result.stdout.strip().split('\n')
    power_draws = [float(line.strip()) for line in lines if line.strip()]
    return sum(power_draws) / len(power_draws)

# GPU profiler
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        evaluate_perplexity(model, input_ids, attention_mask)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# pyRAPL CPU energy tracking
pyRAPL.setup()
@pyRAPL.measureit
def energy_profile():
    return evaluate_perplexity(model, input_ids, attention_mask)
energy_profile()

# Display GPU energy
def gpu_energy():
    result = subprocess.check_output([
        "nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"
    ])
    print(f"GPU Power Draw (Watts): {result.decode().strip()}")

gpu_energy()


Collecting pyRAPL
  Downloading pyRAPL-0.2.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.0.0->accelerate)
  Downloading nvidi

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

2025-04-10 09:44:37.124890: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744278277.348507      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744278277.416918      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/685k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.07M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/618k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Perplexity after pruning: 5366.48
         5449 function calls (4651 primitive calls) in 0.639 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.639    0.639 174659105.py:75(evaluate_perplexity)
        1    0.608    0.608    0.608    0.608 {method 'item' of 'torch._C.TensorBase' objects}
    292/1    0.001    0.000    0.031    0.031 module.py:1732(_wrapped_call_impl)
    292/1    0.001    0.000    0.031    0.031 module.py:1740(_call_impl)
      2/1    0.000    0.000    0.031    0.031 generic.py:949(wrapper)
        1    0.000    0.000    0.031    0.031 deprecation.py:120(wrapped_func)
        1    0.000    0.000    0.031    0.031 modeling_llama.py:765(forward)
        1    0.000    0.000    0.031    0.031 modeling_llama.py:492(forward)
       22    0.001    0.000    0.028    0.001 modeling_llama.py:301(forward)
       22    0.001    0.000    0.015    0.001 modeling_llama.py:240(forward)
   

PyRAPLCantRecordEnergyConsumption: 