In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline
from pyJoules.energy_meter import measure_energy
from pyJoules.handler.pandas_handler import PandasHandler
import tiktoken

pipe = pipeline("text-generation", model="Qwen/Qwen2.5-3B-Instruct")

pandas_handler = PandasHandler()

@measure_energy(handler=pandas_handler)
def generate_response(messages):
    return pipe(messages)

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},  
    {"role": "user", "content": "Tell me all you know about London"},
]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [2]:
text = generate_response(messages)[0]['generated_text'][2]['content']

df = pandas_handler.get_dataframe()

energy_microjoules = df.filter(like='nvidia_gpu').sum(axis=1)

energy_joules = energy_microjoules / 1e6

total_joules = energy_joules.iloc[-1]

n_tokens = len(tiktoken.encoding_for_model("gpt-4").encode(text))

print(f"Total energy: {total_joules:.6f} J")
print(f"Number of tokens: {n_tokens}")
print(f"Energy per token: {total_joules / n_tokens:.6f} J")

Total energy: 4.793036 J
Number of tokens: 251
Energy per token: 0.019096 J
