In [1]:
import sys
sys.path.append("../")
%load_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
#https://github.com/szagoruyko/pytorchviz

In [4]:
import torch
import torchvision
import torchvision.transforms as transforms

In [5]:
import tensorboardX
print("torch:",torch.__version__)
print("tensorboardX:",tensorboardX.__version__)

torch: 1.3.1
tensorboardX: 2.0


In [6]:
import argparse
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

best_acc1 = 0


def validate(val_loader, model, criterion, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch, lr):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


In [7]:
model_names

['alexnet',
 'densenet121',
 'densenet161',
 'densenet169',
 'densenet201',
 'googlenet',
 'inception_v3',
 'mnasnet0_5',
 'mnasnet0_75',
 'mnasnet1_0',
 'mnasnet1_3',
 'mobilenet_v2',
 'resnet101',
 'resnet152',
 'resnet18',
 'resnet34',
 'resnet50',
 'resnext101_32x8d',
 'resnext50_32x4d',
 'shufflenet_v2_x0_5',
 'shufflenet_v2_x1_0',
 'shufflenet_v2_x1_5',
 'shufflenet_v2_x2_0',
 'squeezenet1_0',
 'squeezenet1_1',
 'vgg11',
 'vgg11_bn',
 'vgg13',
 'vgg13_bn',
 'vgg16',
 'vgg16_bn',
 'vgg19',
 'vgg19_bn',
 'wide_resnet101_2',
 'wide_resnet50_2']

In [8]:
data_path = "~/image_net"

In [9]:
data_path

'~/image_net'

In [10]:
!ls {data_path}

train  val


In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


In [12]:
arch='resnet18'
lr=0.1

In [13]:
global best_acc1
batch_size = 4
# create model
# if args.pretrained:
#     print("=> using pre-trained model '{}'".format(args.arch))
#     model = models.__dict__[args.arch](pretrained=True)
# else:
print("=> creating model '{}'".format(arch))
model = models.__dict__[arch]()

model.to(device)


# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                            momentum=0.9,
                            weight_decay=1e-4)


# Data loading code
traindir = os.path.join(data_path, 'train')
valdir = os.path.join(data_path, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

train_dataset = datasets.ImageFolder(
    traindir,
    transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))


train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size,
    num_workers=8)

# val_loader = torch.utils.data.DataLoader(
#     datasets.ImageFolder(valdir, transforms.Compose([
#         transforms.Resize(256),
#         transforms.CenterCrop(224),
#         transforms.ToTensor(),
#         normalize,
#     ])),
#     batch_size=batch_size, shuffle=False,
#     num_workers=2)

# if args.evaluate:
#     validate(val_loader, model, criterion, args)
# else:
    

=> creating model 'resnet18'


In [14]:
device

device(type='cuda', index=0)

In [19]:
# CONFIGURE
data_root = 'graph_web/session_data'
session_id = "test10"

from modelinspector.inspector import Inspector
inspector = Inspector(session_id,data_root)

In [17]:
epoch = 0
itr_total = 0

In [18]:
state_log_freq = 13
metric_log_freq = 2

while epoch < 2:
        adjust_learning_rate(optimizer, epoch, lr)
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        top1 = AverageMeter('Acc@1', ':6.2f')
        top5 = AverageMeter('Acc@5', ':6.2f')
        progress = ProgressMeter(
            len(train_loader),
            [batch_time, data_time, losses, top1, top5],
            prefix="Epoch: [{}]".format(epoch))

        # switch to train mode
        model.train()

        end = time.time()
        for i, (images, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)
            images = images.to(device)
            target = target.to(device)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            try:

                if i % state_log_freq == 0:
                    inspector.log_state(epoch=epoch,
                              itr=i, 
                              model=model,
                              input_dict={'input.1':images},
                              output_dict={'output.1':output},
                              loss_dict={'loss':loss},
                              label_dict={'class_label':target})
                    progress.display(i)


                if i % metric_log_freq == 0 or i % state_log_freq == 0:
                    inspector.log_metrics(
                        epoch=epoch,
                        itr=i, 
                        metrics={
                            'loss':loss.item(),
                            'acc1':acc1[0].item(),
                            'acc5':acc5[0].item()})
            except Exception as e:
                print(e)
                        
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            itr_total +=1
        epoch+=1


Computing graph (0, 0)..


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  ret = ret.dtype.type(ret / rcount)


Saving 0_0
graph_web/session_data/test10/0_0
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][   0/4852]	Time  0.000 ( 0.000)	Data  0.190 ( 0.190)	Loss 6.8029e+00 (6.8029e+00)	Acc@1   0.00 (  0.00)	Acc@5   0.00 (  0.00)
Saving 0_13
graph_web/session_data/test10/0_13
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][  13/4852]	Time  0.045 ( 0.557)	Data  0.001 ( 0.016)	Loss 4.4524e+01 (2.0677e+01)	Acc@1  50.00 ( 69.64)	Acc@5  50.00 ( 83.93)
Saving 0_26
graph_web/session_data/test10/0_26
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][  26/4852]	Time  0.022 ( 0.526)	Data  0.002 ( 0.009)	Loss 7.0041e+00 (1.8518e+01)	Acc@1   0.00 ( 38.89)	Acc@5   0.00 ( 86.11)
Saving 0_39
graph_web/session_data/test10/0_39
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][  39/4852]	Time  0.031 ( 0.511)	Data  0.003 

Saving 0_403
graph_web/session_data/test10/0_403
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][ 403/4852]	Time  0.069 ( 0.495)	Data  0.002 ( 0.002)	Loss 6.7980e+00 (7.8610e+00)	Acc@1   0.00 (  2.60)	Acc@5   0.00 (  9.84)
Saving 0_416
graph_web/session_data/test10/0_416
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][ 416/4852]	Time  0.041 ( 0.495)	Data  0.001 ( 0.002)	Loss 6.8442e+00 (7.8486e+00)	Acc@1   0.00 (  2.52)	Acc@5   0.00 (  9.53)
Saving 0_429
graph_web/session_data/test10/0_429
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][ 429/4852]	Time  0.065 ( 0.495)	Data  0.002 ( 0.002)	Loss 7.0236e+00 (7.8355e+00)	Acc@1   0.00 (  2.44)	Acc@5   0.00 (  9.24)
Saving 0_442
graph_web/session_data/test10/0_442
dict_keys(['id', 'meta', 'step_info', 'additional_info', 'graph_id', 'data', 'graph'])
Epoch: [0][ 442/4852]	Time  0.050 ( 0.496)	Da

KeyboardInterrupt: 

In [None]:
save_last_state(inspector,session_root)

In [None]:
inspector.log_state(epoch=epoch,
          itr=i, 
          model=model,
          input_dict={'input.1':images},
          output_dict={'output.1':output},
          loss_dict={'loss':loss},
          label_dict={'class_label':target})
inspector.compute_stats()
save_last_state(inspector,session_root)
progress.display(i)


In [None]:
!ls -lstrh {state_path}