In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import time
import os
import sys
import warnings
import transformers
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Only surface error-level messages from the transformers library.
transformers.logging.set_verbosity_error()


# Single source of truth for model placement: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Local checkpoint directory; both loaders below pass local_files_only=True.
model_path = "../saved_models/gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_path, local_files_only=True)
model = GPT2LMHeadModel.from_pretrained(model_path, local_files_only=True)

# Move the model through `device` rather than re-checking CUDA with a
# hard-coded "cuda" string, so the variable and the actual placement agree.
model = model.to(device)
if device.type == "cuda":
    print("Using GPU.")

print("Model and tokenizer loaded from local directory.")

Using GPU.
Model and tokenizer loaded from local directory.


In [None]:
from torch.profiler import profile, record_function, ProfilerActivity
import numpy as np


# Make sure the tokenizer has a padding token: when none is set, reuse the
# end-of-sequence token. profile_inference tokenizes with padding=True.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Sample prompts; profile_inference tokenizes them together as a single batch.
prompts = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming the world.",
    "NVIDIA GPUs power the future of AI and gaming.",
    "Reinforcement learning models learn through reward signals.",
    "The sky was a bright shade of blue on a sunny day."
]


def profile_inference(num_runs=5):
    """Profile a batched forward pass of ``model`` over ``prompts``.

    Each run wraps exactly one ``model(**inputs)`` call in a
    ``torch.profiler.profile`` context and prints that run's aggregated
    operator table. Relies on the notebook-level ``tokenizer``, ``model``
    and ``prompts``.

    Args:
        num_runs: Number of profiled forward passes to execute and print.

    Returns:
        None. One table per run is printed to stdout.
    """
    # Only request CUDA activity (and sort by CUDA time) when the model really
    # lives on a GPU; otherwise fall back to a CPU-only profile.
    on_cuda = model.device.type == "cuda"
    activities = [ProfilerActivity.CPU]
    if on_cuda:
        activities.append(ProfilerActivity.CUDA)
    sort_key = "cuda_time_total" if on_cuda else "cpu_time_total"
    sort_label = "CUDA" if on_cuda else "CPU"

    # Tokenization and the host-to-device copy are identical for every run and
    # sit outside the profiled region, so do them once up front.
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Inference only: disable autograd so the profile is not inflated by
    # graph construction and saved activations.
    with torch.no_grad():
        for run in range(num_runs):
            # Start profiling
            with profile(activities=activities, record_shapes=True) as prof:
                with record_function("model_inference"):
                    # Run inference on the batched input
                    model(**inputs)

            print(f"\nRun {run + 1}:")
            print(f"Sorted by {sort_label} time total:")
            print(prof.key_averages().table(sort_by=sort_key))
        

# Run profiling: profile_inference prints one profiler table per run
# (no averaging across runs is performed).
num_runs = 5
profile_inference(num_runs)



Run 1:
Sorted by CUDA time total:
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                  model_inference        31.92%      19.844ms       100.00%      62.161ms      62.161ms       7.903ms        10.32%      76.599ms      76.599ms             1  
                                      aten::addmm        14.33%       8.906ms        14.33%       8.906ms     185.542us      29.048ms        37.92%      29.048ms     605.167us            48  
    