In [15]:


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.profiler import profile, record_function, ProfilerActivity


class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

input_data = torch.randn(64, 1, 28, 28)
dataset = TensorDataset(input_data)
dataloader = DataLoader(dataset, batch_size=64)
model = SimpleModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Run both profilers simultaneously
with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof_cpu:

    with record_function("model_inference"):
        for data in dataloader:
            inputs = data[0]
            outputs = model(inputs)
            loss = criterion(outputs, torch.zeros(64, dtype=torch.long))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

# Print profiler results for CPU
print("CPU Profiler Results:")
print(prof_cpu.key_averages().table(sort_by="cpu_time_total", row_limit=5))

# Print profiler results for CUDA
print("\nCUDA Profiler Results:")
print(prof_cuda.key_averages().table(sort_by="cuda_time_total", row_limit=5))


prof.export_chrome_trace("trace.json")

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_stack=True,
) as prof:
    model(inputs)

# Print aggregated stats
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2))
prof.export_stacks("/tmp/profiler_stacks.txt", "self_cuda_time_total")

from torch.profiler import schedule

my_schedule = schedule(
    skip_first=10,
    wait=5,
    warmup=1,
    active=3,
    repeat=2)

def trace_handler(p):
    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    print(output)
    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2),
    on_trace_ready=trace_handler
) as p:
    for idx in range(8):
        model(inputs)
        p.step()



CPU Profiler Results:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        16.20%     860.000us       100.00%       5.310ms       5.310ms     797.60 Kb    -228.01 Kb             1  
                               Optimizer.step#Adam.step        30.55%       1.622ms        42.18%       2.240ms       2.240ms     795.09 Kb    -795.08 Kb             1  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...         7.70%     409.000us        14.48%     769.000us     384.500us   

STAGE:2024-04-23 10:48:27 1984225:1984225 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-04-23 10:48:27 1984225:1984225 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-04-23 10:48:27 1984225:1984225 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


NameError: name 'prof' is not defined

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.profiler import profile, record_function, ProfilerActivity

# Define the SimpleModel class
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Create input data, dataset, and dataloader
input_data = torch.randn(64, 1, 28, 28)
dataset = TensorDataset(input_data)
dataloader = DataLoader(dataset, batch_size=64)

# Create model, criterion, and optimizer
model = SimpleModel()
# model.cuda()  # Move model to CUDA device
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


model1 = SimpleModel()
model1.cuda()  # Move model to CUDA device
criterion1 = nn.CrossEntropyLoss()
optimizer1 = optim.Adam(model.parameters(), lr=0.001)

# Run both profilers simultaneously
with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof_cpu:
    with record_function("cpu_model_inference"):
        for data in dataloader:
            inputs = data[0]  # No need to move input data to CUDA device within CPU context
            outputs = model(inputs)
            loss = criterion(outputs, torch.zeros(64, dtype=torch.long))  # No need to move target to CUDA device within CPU context
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
with profile(activities=[ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof_cuda:
    with record_function("cuda_model_inference"):
        for data in dataloader:
            inputs = data[0].cuda()  # Move input data to CUDA device within CUDA context
            outputs = model1(inputs)
            loss = criterion1(outputs, torch.zeros(64, dtype=torch.long).cuda())  # Move target to CUDA device within CUDA context
            loss.backward()
            optimizer1.step()
            optimizer1.zero_grad()

# Print profiler results for CPU
print("CPU Profiler Results:")
print(prof_cpu.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=5))

# Print profiler results for CUDA
print("\nCUDA Profiler Results:")
print(prof_cuda.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=5))

# Export profiler trace
prof_cpu.export_chrome_trace("cpu_trace.json")
prof_cuda.export_chrome_trace("cuda_trace.json")



with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    on_trace_ready=lambda trace: trace.export_chrome_trace("/tmp/trace_{}.json".format(trace.step_num))
) as p:
    for idx in range(10):  
        inputs = input_data.cuda()  
        model1(inputs)

print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=2))



CPU Profiler Results:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                                  Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------  
                                    cpu_model_inference        19.07%       1.524ms       100.00%       7.992ms       7.992ms     993.60 Kb     -32.01 Kb             1                                            []  
                               Optimizer.step#Adam.step        19.68%       1.573ms        29.98%       2.396ms   

STAGE:2024-04-23 10:56:27 1984773:1984773 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-04-23 10:56:27 1984773:1984773 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-04-23 10:56:27 1984773:1984773 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
STAGE:2024-04-23 10:56:27 1984773:1984773 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-04-23 10:56:27 1984773:1984773 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-04-23 10:56:27 1984773:1984773 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
