In [None]:
from collections import OrderedDict
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import time

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

import torchvision
import torchvision.transforms as transforms
import torchvision.models as models

In [None]:
def accuracy(output, target, topk=(1,)):
  """Computes the accuracy over the k top predictions for the specified values of k.

  Args:
    output: score tensor indexed as (sample, class); the top-k search runs
      along dim 1.
    target: tensor of class indices with one entry per sample.
    topk: iterable of k values to evaluate.

  Returns:
    A list with one single-element tensor per requested k, holding the
    percentage (0-100) of samples whose target is among the k best-scoring
    classes.
  """
  with torch.no_grad():
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk highest-scoring classes per sample, best first.
    _, pred = output.topk(maxk, 1, True, True)
    # Transpose to (maxk, batch): row j holds every sample's (j+1)-th guess.
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
      # `correct` carries the transposed, non-contiguous layout of `pred`, so
      # it is flattened with reshape(); view() raises on such a slice when
      # k > 1 on recent PyTorch releases.
      correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
      res.append(correct_k.mul_(100.0 / batch_size))
    return res

class ProgressMeter(object):
  """Prints one tab-separated progress line: a batch counter followed by every meter."""

  def __init__(self, num_batches, meters, prefix=""):
    self.prefix = prefix
    self.meters = meters
    self.batch_fmtstr = self._get_batch_fmtstr(num_batches)

  def display(self, batch):
    """Print the prefix, the "[batch/total]" counter and the str() of each meter."""
    counter = self.prefix + self.batch_fmtstr.format(batch)
    print('\t'.join([counter] + [str(m) for m in self.meters]))

  def _get_batch_fmtstr(self, num_batches):
    """Return the "[{:Nd}/total]" template, N being the number of digits in the total."""
    width = len(str(num_batches // 1))
    slot = '{:%dd}' % width
    return '[%s/%s]' % (slot, slot.format(num_batches))

class AverageMeter(object):
  """Computes and stores the average and current value.

  Two sets of running statistics are kept: `sum` / `count` / `avg` are cleared
  by `reset()`, while `epoch_sum` / `epoch_count` / `epoch_avg` are only
  initialised in the constructor and keep accumulating across resets.
  """
  def __init__(self, name, fmt=':f'):
    self.name = name
    self.fmt = fmt
    # Most recent value passed to update(). Initialised here rather than in
    # reset() so that reading `val` before the first update() does not raise
    # AttributeError, while reset() still leaves the last value untouched.
    self.val = 0
    self.reset()
    self.epoch_sum = 0
    self.epoch_count = 0
    self.epoch_avg = 0

  def reset(self):
    """Clear the resettable statistics; `val` and the epoch_* totals are kept."""
    self.avg = 0
    self.sum = 0
    self.count = 0

  def update(self, val, n=1):
    """Record `val` as the latest value, weighted by `n`, in both sets of statistics."""
    self.val = val
    self.sum += val * n
    self.count += n
    self.avg = self.sum / self.count
    self.epoch_sum += val * n
    self.epoch_count += n
    self.epoch_avg = self.epoch_sum / self.epoch_count

  def __str__(self):
    """Format as "<name> <avg> (<epoch_avg>)", applying `fmt` to both numbers."""
    fmtstr = '{name} {avg' + self.fmt + '} ({epoch_avg' + self.fmt + '})'
    return fmtstr.format(**self.__dict__)

In [None]:
# Change the kernel's working directory to the FixRes checkout (presumably so
# that the `imnet_finetune` package imported in later cells resolves — confirm).
%cd /workspace/FixRes/

In [None]:
# Import only the constructor that is actually used rather than `import *`,
# so the notebook namespace stays explicit.
from imnet_finetune.resnext_wsl import resnext101_32x48d_wsl

# Build the network; progress=False is forwarded to the constructor
# (presumably it silences a download progress bar — confirm).
model = resnext101_32x48d_wsl(progress=False)

In [None]:
###################################################
## Settings
# Mini-batch size handed to the DataLoader below.
batch_size = 2

###################################################
## Load Data
# (disabled) earlier train/val split helper and class list, kept for reference:
# dataloaders = {}
# dataloaders['train'], dataloaders['val'] = get_train_val_loaders('./data', batch_size, val_ratio)
# trainloader, _ =  get_train_val_loaders('./data', batch_size, val_ratio)

# classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
from imnet_finetune.transforms import get_transforms
# Request both the 'train' and 'val' transform pipelines at a 320 input/test size.
transformation = get_transforms(input_size=320,test_size=320, kind='full', crop=True, need=('train', 'val'), backbone=None)
# NOTE(review): the training folder is read through the 'val' transform —
# presumably deliberate for profiling; confirm.
trainset = torchvision.datasets.ImageFolder('/workspace/data/train', transform=transformation['val'])
# No sampler or shuffle argument is passed, so the DataLoader defaults apply.
trainloader = DataLoader(trainset, batch_size=batch_size, num_workers=2)
print(trainset)

###################################################
## Load Model
# Detect if we have a GPU available; fall back to the CPU otherwise.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Define/load model
# (disabled) replace the classifier head with a 10-class linear layer:
# num_ftrs = model.fc.in_features
# model.fc = nn.Linear(num_ftrs, 10)
# Send model to the selected device
model.to(device)

# Define loss function (criterion) and optimizer and LR scheduler
criterion = nn.CrossEntropyLoss()
# NOTE: define optimizer after sending model to GPU. May lead to error otherwise.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# (disabled) step LR schedule:
#   lrscheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
# Running meters for the loss and the top-1 / top-5 accuracy.
losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
top5 = AverageMeter('Acc@5', ':6.2f')

# set to train mode
model.train()

# Optional free-form result lines, printed once after the loop.
metrics = []

trainiter = iter(trainloader)
# Number of batches to run under the profiler.
batches = 1
# `device` falls back to the CPU when CUDA is unavailable, so the CUDA-only
# calls (profiler CUDA timing, synchronize) are enabled only on a GPU.
use_cuda = device.type == "cuda"
with torch.autograd.profiler.profile(enabled=True, use_cuda=use_cuda, record_shapes=True) as prof:
    for i in range(batches):
        # The built-in next() is the iterator protocol; the `.next()` method
        # is not available on current DataLoader iterators.
        images, target = next(trainiter)
        # Drain pending GPU work so the wall-clock window below covers only
        # this step (host-to-device copy, forward, backward, optimizer step).
        if use_cuda:
            torch.cuda.synchronize()
        start = time.time()
        images = images.to(device)
        target = target.to(device)

        output = model(images)
        loss = criterion(output, target)
        # compute gradients and take an optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Wait for the GPU to finish before stopping the clock.
        if use_cuda:
            torch.cuda.synchronize()
        end = time.time()
        print("Time: ", end-start)
        # measure accuracy and record loss; .item() stores plain Python floats
        # in the meters instead of tensors living on the device.
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0].item(), images.size(0))
        top5.update(acc5[0].item(), images.size(0))
        print(' * TRAIN: Acc@1 {top1.epoch_avg:.3f} Acc@5 {top5.epoch_avg:.3f}'.format(top1=top1, top5=top5))

print('\n'.join(metrics))

###### Run the above cell twice: the profiler output of the first run differs from that of subsequent runs.

In [None]:
# prof.table() has to be called once first: the parent/child links are filled
# in lazily, and cpu_children is not populated before that call.
prof.table(row_limit=1)
# Ids of every event that appears as a CPU child of some other event.
children = {child.id for evt in prof.function_events for child in evt.cpu_children}
# Lookup table from event id to the event object itself.
events = {evt.id: evt for evt in prof.function_events}
print(len(children))
print(len(events))

In [None]:
# Top-level events: those whose id is not the CPU child of any other event.
mainevts = list(filter(lambda evt: evt.id not in children, prof.function_events))
# print([evt.name for evt in mainevts if evt.name not in ['detach_', 'set_', 'zero_']])

In [None]:
# Total CUDA time over every recorded event (pysum) versus over the top-level
# events only (mysum).
pysum = sum(evt.cuda_time_total for evt in prof.function_events)
mysum = sum(evt.cuda_time_total for evt in mainevts)
print(pysum, mysum)

In [None]:
# Calculate running time from the start of the first kernel to the end of the
# last kernel. Every kernel of every top-level event is considered, and events
# that launched no kernel are skipped instead of raising IndexError.
kernel_intervals = [kernel.interval for evt in mainevts for kernel in evt.kernels]
if kernel_intervals:
    mint = min(interval.start for interval in kernel_intervals)
    maxt = max(interval.end for interval in kernel_intervals)
    print(maxt-mint)
else:
    # Nothing to measure, e.g. the profiler ran without CUDA timing.
    mint = maxt = None
    print("No CUDA kernels were recorded by the profiler")

In [None]:
import pandas as pd

def isCudnnOperation(e):
    """Return True if the name of `e`, or of any event below it, contains a space.

    The check walks `cpu_children` recursively and stops at the first match.
    NOTE(review): the function name implies that a space in the event name
    marks a cuDNN-backed op; that convention is not established in this
    notebook — confirm against the profiler output.
    """
    return " " in e.name or any(isCudnnOperation(child) for child in e.cpu_children)

# One row per top-level event: its name, the start and duration of its first
# kernel, the recorded input shapes, and the negated isCudnnOperation() flag.
mainevts_cudatime = [
    [
        e.name,
        e.kernels[0].interval.start,
        e.kernels[0].interval.elapsed_us(),
        e.input_shapes,
        not isCudnnOperation(e),
    ]
    for e in mainevts
]
# print(len(mainevts_cudatime))
df = pd.DataFrame(
    mainevts_cudatime,
    columns=['name', 'cudaStart', 'cudaDuration', 'inputShapes', 'isNative'],
)
print(df)

In [None]:
# Total CUDA duration per op name, largest first. Built as one method chain so
# no frame is mutated in place and re-running the cell is idempotent.
ops = (
    df.groupby(['name'])['cudaDuration']
    .sum()
    .reset_index(name='totalCudaTime')
    .sort_values('totalCudaTime', ascending=False)
)

total_cuda_time = ops['totalCudaTime'].sum()
# print(total_cuda_time)

# Share of the total, in percent, computed on the whole column at once
# instead of a row-by-row apply().
ops['%ageCudaTime'] = ops['totalCudaTime'] * 100 / total_cuda_time
display(ops.head(10))

In [None]:
# ops.to_csv('/workspace/DeepLearningMisc/resnext101_32x8d_b28-ops.csv', index=False)

In [None]:
# Profiler's own aggregated summary table, ordered by the "cuda_time_total" column.
print(prof.key_averages().table(sort_by="cuda_time_total"))

In [None]:
# display(df.tail(50))
# Index labels of the rows whose event name is exactly "to".
toIndices = df.index[df['name'].eq("to")].tolist()
print(toIndices)

# Index labels of the rows whose event name contains "topk" (the first tensor
# op issued inside accuracy()).
accComputeStartIndex = df.index[df['name'].str.contains("topk")].tolist()
print(accComputeStartIndex)

In [None]:
# Fail with a readable message instead of a bare IndexError when the trace
# does not contain the events the slicing below relies on.
assert len(toIndices) >= 4, "expected at least four 'to' events in the trace"
assert accComputeStartIndex, "no 'topk' event found in the trace"
# The window starts at the third "to" event, which must be directly followed
# by the fourth one.
assert toIndices[2]+1 == toIndices[3], "check starting index"
# .loc slices by label and includes the end label, hence the -1 to stop just
# before the first "topk" row. reset_index() returns a new frame, so the slice
# of df is never modified in place.
actdf = df.loc[toIndices[2]:(accComputeStartIndex[0]-1)].reset_index(drop=True)
display(actdf)

In [None]:
# actdf.to_csv('/workspace/DeepLearningMisc/resnext101_32x48d_b2-pytorchtrace.csv', sep=';')

In [None]:
# Work on an independent copy so actdf itself stays untouched.
tmp = actdf.copy(deep=True)
display(tmp.head())

# Index labels of the rows flagged as native.
nativeOps = tmp.index[tmp['isNative'].eq(True)].tolist()
print(len(nativeOps), len(actdf))

# For each pair of consecutive native rows, record how many rows lie strictly
# between them, keeping only the pairs separated by more than one row.
gaps = {
    (first, second): second - first - 1
    for first, second in zip(nativeOps, nativeOps[1:])
    if second - first - 1 > 1
}
print(gaps)

In [None]:
# Full per-event profiler table (can be very long).
print(prof.table())

In [None]:
# Print the model's module structure for reference.
print(model)