In [7]:
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision
import torchvision.transforms as T
import torchvision.datasets as datasets
import torchvision.models as models

In [2]:
import torch.profiler

In [4]:
# Turn on cuDNN's benchmark mode before any convolution runs (see the PyTorch
# cuDNN backend notes for what this flag does).
cudnn.benchmark = True

# ResNet-50 from torchvision, created with pretrained weights and moved to the GPU.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` -- confirm the installed version before changing this call.
model = models.resnet50(pretrained=True).cuda()

In [8]:
# Device that the training loop moves each batch of inputs and labels to.
device = torch.device("cuda:0")

# Image pre-processing: resize to 256, center-crop to 224, convert to a tensor.
# NOTE(review): no T.Normalize step is applied. That is presumably fine for a
# profiling run, but confirm before using this pipeline for real training with
# the pretrained weights.
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
])

# CIFAR-10 training split, stored under ./data (downloaded if not already there).
trainset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Shuffled mini-batches of 16 samples.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=16,
    shuffle=True,
)

# Loss function: cross-entropy between the model outputs and the labels,
# placed on the GPU.
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()

# Optimizer: SGD with momentum. It applies the parameter updates once the
# gradients have been computed by loss.backward().
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=0.001,
    momentum=0.9,
)

Files already downloaded and verified


In [9]:
# Switch the model to training mode before running the profiled training steps.
# This call is the last expression in the cell, so the notebook also prints the
# returned module's repr (the "ResNet(...)" layer listing) as the cell output.
model.train()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [12]:
def output_fn(p, trace_path="./trace/resnet50_record/worker0.pt.trace.json"):
    """Export the profiler's collected trace as a Chrome trace JSON file.

    Intended to be passed as ``on_trace_ready`` to ``torch.profiler.profile``,
    which calls it with the profiler instance as the only argument.

    Args:
        p: Object exposing ``export_chrome_trace(path)`` (the active profiler).
        trace_path: Destination file for the trace. Defaults to the path this
            notebook has always used, so existing callers are unaffected.
    """
    # Local import keeps this cell runnable regardless of which import cells
    # have been executed.
    import os

    # Nothing else in the notebook creates the output directory, so make sure
    # it exists before exporting; otherwise the trace has nowhere to be written.
    trace_dir = os.path.dirname(trace_path)
    if trace_dir:
        os.makedirs(trace_dir, exist_ok=True)

    p.export_chrome_trace(trace_path)

# Profiler schedule, expressed in training steps (see the torch.profiler.schedule
# docs). Keeping these as named constants lets the loop below stop after exactly
# one full wait -> warmup -> active cycle instead of relying on a hard-coded
# step count that has to be kept in sync by hand.
PROFILE_WAIT = 2    # steps during which the profiler is idle
PROFILE_WARMUP = 3  # steps that are traced but whose results are discarded
PROFILE_ACTIVE = 6  # steps that are actually recorded into the trace
PROFILE_TOTAL_STEPS = PROFILE_WAIT + PROFILE_WARMUP + PROFILE_ACTIVE

# Wrap the training loop in the profiler context manager. CPU and CUDA activity
# are both recorded, along with operator input shapes; output_fn is invoked when
# a trace is ready.
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(
        wait=PROFILE_WAIT,
        warmup=PROFILE_WARMUP,
        active=PROFILE_ACTIVE),
    on_trace_ready=output_fn,
    record_shapes=True
) as p:
    for step, data in enumerate(trainloader, 0):
        print("step:{}".format(step))

        # Move the batch to the target device.
        inputs, labels = data[0].to(device=device), data[1].to(device=device)

        # Forward pass and loss.
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Clear old gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Tell the profiler that one training step has finished so that it can
        # advance through its schedule.
        p.step()

        # Stop once one complete profiling cycle has run.
        if step + 1 >= PROFILE_TOTAL_STEPS:
            break


step:0
step:1
step:2
step:3
step:4
step:5
step:6
step:7
step:8
step:9
step:10
