In [2]:

from utils import getModelWithOptimized

def printStats(model):
    """Print the number of trainable and total parameters of ``model``.

    Args:
        model: object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).
    """
    # Materialise once so both counts are computed from the same snapshot
    # (``parameters()`` may be a one-shot iterator).
    params = list(model.parameters())
    total = sum(p.numel() for p in params)
    trainable = sum(p.numel() for p in params if p.requires_grad)
    print(f'trainable: {trainable:,d}')
    print(f'total:     {total:,d}')

# Architectures whose parameter counts are reported by the loop below.
archs = ['densenet121', 'densenet169', 'resnet18', 'resnet101', 'msdnet4', 'msdnet5', 'msdnet10']

# Build each network once and print its trainable / total parameter count.
for arch in archs:
    # NOTE(review): the meaning of n=0 and batch_size=1 is defined by
    # utils.getModelWithOptimized, which is not visible here — confirm there.
    net = getModelWithOptimized(arch, n=0, batch_size=1)
    print(f'\n***** Stats for {arch} *****')
    printStats(net)


***** Stats for densenet121 *****
trainabel: 6,994,856
total:     6,994,856

***** Stats for densenet169 *****
trainabel: 12,551,080
total:     12,551,080

***** Stats for resnet18 *****
trainabel: 11,189,352
total:     11,189,352

***** Stats for resnet101 *****
trainabel: 42,574,440
total:     42,574,440
building network of steps: 
[4, 4, 4, 4] 16
 ********************** Block 1  **********************
|		inScales 4 outScales 4 inChannels 32 outChannels 16		|

|		inScales 4 outScales 4 inChannels 48 outChannels 16		|

|		inScales 4 outScales 4 inChannels 64 outChannels 16		|

|		inScales 4 outScales 4 inChannels 80 outChannels 16		|

 ********************** Block 2  **********************
|		inScales 4 outScales 3 inChannels 96 outChannels 16		|
|		Transition layer inserted! (max), inChannels 112, outChannels 56	|

|		inScales 3 outScales 3 inChannels 56 outChannels 16		|

|		inScales 3 outScales 3 inChannels 72 outChannels 16		|

|		inScales 3 outScales 3 inChannels 88 outChannels 

In [None]:
# Colab only: mount Google Drive at /content/drive so the later cells can
# copy files from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# copy anytimeDnn data
#!cp -r drive/My\ Drive/reducedAnytimeDnn/* .
!mkdir data
!cp -r drive/My\ Drive/reducedAnytimeDnn/data/ImagenetDataset.py ./data/ImagenetDataset.py
!cp -r drive/My\ Drive/reducedAnytimeDnn/data/__init__.py ./data/__init__.py
!cp -r drive/My\ Drive/reducedAnytimeDnn/densenet .
!cp -r drive/My\ Drive/reducedAnytimeDnn/msdnet .
!cp -r drive/My\ Drive/reducedAnytimeDnn/resnet .
!ls

In [None]:
# Environment setup. Only `nvidia-smi` is active in this cell; the install /
# uninstall commands are kept commented out as alternatives that were tried.
#!pip install -r drive/My\ Drive/reducedAnytimeDnn/requirements.txt
#!pip3 install torch===1.6.0 torchvision===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
#!conda install pytorch==1.5.0 torchvision==0.6.0 cudatoolkit=10.1 -c pytorch
!nvidia-smi
#!pip install numpy
#!pip uninstall torch torchvision
#!pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html


In [1]:
import torch
# Sanity check of the PyTorch / CUDA setup before training.
torch.cuda.empty_cache()
print(torch.version.cuda)              # CUDA version PyTorch was built against
print(torch.cuda.is_available())       # whether a usable CUDA device is present
print(torch.backends.cudnn.enabled)    # whether the cuDNN backend is enabled

10.2
False
True


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn

from msdnet.dataloader import get_dataloaders_alt
from resnet import ResNet
import densenet.densenet as dn

from data.ImagenetDataset import get_zipped_dataloaders

import os
import shutil
import time
import datetime
import sys
import logging

from utils import *

################################- Constants for Checkpoints -###################################
# Epoch number embedded in the checkpoint file name that main() tries to resume from.
# File name containing checkpoint for given architecture: <arch_name>_<EPOCH>_checkpoint.pth.tar
LAST_CHECKPOINT_EPOCH = 0
# True: Resume from a checkpoint file stored in the checkpoint subdirectory
# or use default if none is found
# False: Do not resume from any possible checkpoint file; main() then starts at
# START_EPOCH with a best accuracy of 0.0
RESUME = True
# Architecture name passed to getModel() and used as prefix of checkpoint file names.
ARCH = 'resnet50'
# NOTE(review): not referenced in the visible code — presumably the list of valid ARCH values.
ARCH_NAMES = ['resnet50', 'resnet101', 'resnet152', 'densenet121', 'densenet169']
################################################################################################

# Debug mode: the train / validation / evaluation loops stop once the batch index
# reaches DEBUG_ITERATIONS, main() stops after the first epoch and always checkpoints.
IS_DEBUG = True
DEBUG_ITERATIONS = 40
# Statistics are logged every STAT_FREQUENCY iterations (200 was the earlier value).
STAT_FREQUENCY = 10#200
# SGD hyper-parameters handed to torch.optim.SGD in main().
LEARNING_RATE = 0.1
MOMENTUM = 0.9
WEIGHT_DECAY = 1e-4
# None: main() wraps the model in nn.DataParallel; otherwise the model is moved to this device id.
GPU_ID = None
# First epoch when training does not resume from a checkpoint.
START_EPOCH = 0
# Exclusive upper bound of the epoch loop in main().
EPOCHS = 90
# A checkpoint is written every CHECKPOINT_INTERVALL epochs (and whenever accuracy improves).
CHECKPOINT_INTERVALL = 2


#CHECKPOINT_DIR = 'drive/My Drive/reducedAnytimeDnn/checkpoints'
# Checkpoint directory; main() joins it onto the current working directory.
CHECKPOINT_DIR = 'checkpoints'
# for repo:
# raw images
# DATA_PATH = "data/imagenet_images"
# zipped preprocessed images (read by the evaluation helpers)
DATA_PATH = "data/imagenet_full"
# for colab:
# DATA_PATH = "drive/My Drive/reducedAnytimeDnn/data/imagenet_images"
# Batch size handed to get_zipped_dataloaders (16 was the earlier value).
BATCH_SIZE = 5#16
# NOTE(review): not referenced in the visible code.
NUM_WORKERS = 1

def main(argv):
    """Train ``ARCH`` and write checkpoints into ``CHECKPOINT_DIR``.

    Steps: build the model, place it on the available device(s), create the
    loss and SGD optimizer, load the data, optionally resume from a
    checkpoint, then run the epoch loop (train, validate, checkpoint).

    Args:
        argv: command-line arguments; currently unused.
    """
    torch.cuda.empty_cache()

    n_gpus_per_node = torch.cuda.device_count()
    logging.info(f"Found {n_gpus_per_node} GPU(-s)")

    # create model
    model = getModel(ARCH)

    logging.info(f"Training Arch:{ARCH}")

    use_cuda = torch.cuda.is_available()
    if not use_cuda:
        logging.warning("Using CPU for slow training process")
    else:
        logging.debug("Cuda is available")
        if GPU_ID is not None:
            logging.info(f"Using specific GPU: {GPU_ID}")
            logging.warning("This will reduce the training speed significantly.")
            torch.cuda.set_device(GPU_ID)
            model.cuda(GPU_ID)
        else:
            logging.info("Using all available GPUs")
            for i in range(n_gpus_per_node):
                logging.info(f"gpu:{i} - {torch.cuda.get_device_name(i)}")
            model = nn.DataParallel(model).cuda()

    # loss function (criterion) and optimizer
    if use_cuda:
        logging.info("Move cross entropy to device")
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(
        model.parameters(),
        LEARNING_RATE,
        momentum=MOMENTUM,
        weight_decay=WEIGHT_DECAY)

    cudnn.benchmark = True

    # Use the configured DATA_PATH instead of a second hard-coded copy of it,
    # so changing the constant affects training as well as evaluation.
    # The second loader is the one validated on after every epoch.
    train_loader, val_loader, _ = get_zipped_dataloaders(
        os.path.join(os.getcwd(), DATA_PATH),
        BATCH_SIZE,
        use_valid=True)

    # size of batch:
    logging.debug(get_batch_size_stats(train_loader))

    if RESUME:
        model, optimizer, start_epoch, best_acc = resumeFromPath(
            os.path.join(
                os.getcwd(),
                CHECKPOINT_DIR,
                f"{ARCH}_{LAST_CHECKPOINT_EPOCH}{CHECKPOINT_POSTFIX}"),
            model,
            optimizer)
    else:
        start_epoch = START_EPOCH
        best_acc = 0.0

    checkpoint_time = AverageMeter('Checkpoint Time', ':6.3f')
    epoch_time = AverageMeter('Epoch Time', ':6.3f')
    # train loop
    end = time.time()
    for epoch in range(start_epoch, EPOCHS):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        logging.debug('Running train loop')
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate the network on the validation loader
        logging.debug('Compute accuracy')
        acc = validate(val_loader, model, criterion)

        # remember top acc
        is_best = acc > best_acc
        best_acc = max(acc, best_acc)

        # save model: on the regular interval, on a new best, and always in debug mode
        if epoch % CHECKPOINT_INTERVALL == 0 or is_best or IS_DEBUG:
            start = time.time()
            save_checkpoint(
                getStateDict(
                    model,
                    epoch,
                    ARCH,
                    best_acc,
                    optimizer),
                is_best, ARCH, os.path.join(os.getcwd(), CHECKPOINT_DIR))
            checkpoint_time.update(time.time() - start)
            logging.info(checkpoint_time)
        # debug runs stop after a single epoch (before epoch timing is recorded)
        if IS_DEBUG:
            break
        epoch_time.update(time.time() - end)
        end = time.time()
        logging.info(epoch)
        logging.info(f"Avg-Epoch={epoch_time.avg}sec, Avg-Checkp.={checkpoint_time.avg}sec")
    logging.info(f"Best accuracy: {best_acc}")

In [10]:

# Scratch check: slicing a 4-D tensor along its first dimension.
tensor = torch.rand(3, 2, 4, 4)
print(tensor)
print()
# Keeps the first two entries of dimension 0.
print(tensor[0:2])

tensor([[[[0.5707, 0.5681, 0.3645, 0.0510],
          [0.7279, 0.7553, 0.3766, 0.5981],
          [0.5244, 0.7212, 0.0399, 0.0585],
          [0.6373, 0.6730, 0.6344, 0.7679]],

         [[0.3602, 0.6710, 0.0085, 0.3773],
          [0.8418, 0.4608, 0.4441, 0.4968],
          [0.3460, 0.4951, 0.9141, 0.2032],
          [0.8242, 0.4571, 0.5274, 0.7714]]],


        [[[0.3617, 0.7562, 0.2090, 0.8389],
          [0.4368, 0.1903, 0.1011, 0.4792],
          [0.2043, 0.7668, 0.7142, 0.7845],
          [0.1320, 0.9561, 0.7371, 0.8890]],

         [[0.6037, 0.3062, 0.7786, 0.3867],
          [0.0067, 0.7528, 0.5455, 0.9434],
          [0.9406, 0.3588, 0.2097, 0.3950],
          [0.0163, 0.8539, 0.6199, 0.0292]]],


        [[[0.3770, 0.5892, 0.3547, 0.1008],
          [0.9942, 0.8421, 0.7856, 0.7889],
          [0.2210, 0.7621, 0.1160, 0.7656],
          [0.5356, 0.6063, 0.4188, 0.0334]],

         [[0.5156, 0.3893, 0.5888, 0.4524],
          [0.8912, 0.3957, 0.3425, 0.2525],
          [0.9201,

In [3]:
def accuracy(output, target, topk=(1,)):
    """Compute the top-k accuracy in percent for every k in ``topk``.

    Args:
        output: 2-D score tensor; the top-k search runs along dimension 1.
        target: tensor of ground-truth class indices, one per row of
            ``output``.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one single-element float tensor per entry of ``topk``,
        in the same order.
    """
    # no gradients needed; reduces memory consumption of the calculations below
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # indices of the maxk largest scores per row, transposed so that
        # row j holds the (j+1)-th best prediction of every sample
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape (not view): the slice of the transposed comparison can
            # be non-contiguous, in which case view(-1) raises a RuntimeError
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

def adjust_learning_rate(optimizer, epoch):
    """Set the step-decayed learning rate for ``epoch`` on ``optimizer``.

    The rate is ``LEARNING_RATE`` divided by 10 for every 25 completed
    epochs. It is written on every call, so the optimizer ends up with the
    scheduled value even when training starts at an epoch that is not a
    multiple of 25.

    Args:
        optimizer: object with a ``param_groups`` list of dicts.
        epoch: zero-based index of the epoch about to be trained.
    """
    lr = LEARNING_RATE * (0.1 ** (epoch // 25))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch and log running statistics.

    Args:
        train_loader: iterable yielding ``(img, target)`` batches.
        model: network to train; switched to training mode here.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only in log messages.
    """
    model.train()
    batch_time = AverageMeter('Batch Time', ':6.3f')
    data_load_time = AverageMeter('Data Time', ':6.3f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')

    end = time.time()
    for i, (img, target) in enumerate(train_loader):

        # Only touch CUDA when it is available, mirroring the device
        # selection in main(); a configured GPU_ID on a CPU-only machine
        # must not crash the loop.
        if torch.cuda.is_available():
            if GPU_ID is not None:
                img = img.cuda(GPU_ID, non_blocking=True)
            target = target.cuda(GPU_ID, non_blocking=True)
        # time it takes to load data
        data_load_time.update(time.time() - end)

        # compute output of the current network
        output = model(img)
        loss = criterion(output, target)

        acc1, acc5 = accuracy(output, target, topk=(1, 5))

        # store plain floats, as validate() does, instead of tensors
        top1.update(acc1.item(), img.size(0))
        top5.update(acc5.item(), img.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # log statistics every STAT_FREQUENCY mini-batches
        if i % STAT_FREQUENCY == STAT_FREQUENCY - 1:
            logging.info(f'Epoch {epoch} Train loop - Iteration {i}/{len(train_loader)} - Loss {loss}')
            logging.info(top1)
            logging.info(top5)
            logging.info(batch_time)
            logging.info(data_load_time)
        # debug runs stop early
        if IS_DEBUG and i == DEBUG_ITERATIONS:
            break
    logging.info(f"Epoch {epoch} train summary: Avg. Acc@1={top1.avg:6.2f} - "
        + f"Avg. Acc@5={top5.avg:6.2f} - "
        + f"Avg. Batch={batch_time.avg:6.2f}sec - "
        + f"Avg. DataLoad={data_load_time.avg}sec" )

def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Loss, top-1 and top-5 accuracy and per-batch time are tracked and logged
    every ``STAT_FREQUENCY`` batches.

    Args:
        val_loader: iterable yielding ``(img, target)`` batches.
        model: network to evaluate; switched to evaluation mode here.
        criterion: loss called as ``criterion(output, target)``.

    Returns:
        The running average of the top-1 accuracy (``top1.avg``).
    """
    model.eval()

    batch_time = AverageMeter('Batch Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')

    with torch.no_grad():
        end = time.time()
        for i, (img, target) in enumerate(val_loader):
            # Only touch CUDA when it is available, mirroring the device
            # selection in main().
            if torch.cuda.is_available():
                if GPU_ID is not None:
                    img = img.cuda(GPU_ID, non_blocking=True)
                target = target.cuda(GPU_ID, non_blocking=True)

            # compute output
            output = model(img)

            # compute loss
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), img.size(0))
            top1.update(prec1.item(), img.size(0))
            top5.update(prec5.item(), img.size(0))

            # measure elapsed time per batch; the reference point has to be
            # reset, otherwise the meter records time since the loop started
            batch_time.update(time.time() - end)
            end = time.time()

            if i % STAT_FREQUENCY == STAT_FREQUENCY - 1:
                logging.info(f'validation loop {i} of {len(val_loader)}')
                logging.info(losses)
                logging.info(top1)
                logging.info(top5)
                logging.info(batch_time)
            # debug runs stop early
            if IS_DEBUG and i == DEBUG_ITERATIONS:
                break
    return top1.avg

def loadAndEvaluate():
    """Load a stored checkpoint for ``ARCH`` and evaluate it on the test loader.

    The ``<ARCH>_model_best.pth.tar`` checkpoint is preferred; if it does not
    exist, ``<ARCH>_checkpoint.pth.tar`` is used. Nothing is returned; the
    results are handed to ``printStats``.
    """
    model = getModel(ARCH)

    best_path = os.path.join(CHECKPOINT_DIR, ARCH + '_model_best.pth.tar')
    if os.path.exists(best_path):
        logging.debug("Loading best model")
        load_path = best_path
    else:
        logging.debug("Loading default model")
        load_path = os.path.join(CHECKPOINT_DIR, ARCH + '_checkpoint.pth.tar')

    logging.debug('Loading: ' + load_path)

    # main() unpacks four values from resumeFromPath; star-unpacking keeps
    # only the model and does not depend on the exact tuple length.
    # NOTE(review): no optimizer is passed here — confirm resumeFromPath
    # accepts that.
    model, *_ = resumeFromPath(load_path, model)

    logging.debug('Loading Test Data..')

    _, _, testLoader = get_zipped_dataloaders(DATA_PATH, BATCH_SIZE, use_valid=True)
    grndT, pred = evaluateModel(model, testLoader)

    # NOTE(review): this needs a two-argument printStats (presumably the one
    # from utils); a one-argument printStats(model) defined in the notebook
    # would shadow it — verify which definition is in scope.
    printStats(grndT, pred)

def evaluateModel(model, loader):
    """Collect ground-truth and predicted class names for every sample in ``loader``.

    Args:
        model: network called as ``model(images)``; switched to evaluation
            mode here.
        loader: iterable of ``(images, labels)`` batches that also exposes
            ``dataset`` (used only for logging its length).

    Returns:
        ``(grndT, pred)``: two lists of entries of ``classes`` looked up by
        label index and by predicted index, in iteration order.
    """
    model.eval()

    with torch.no_grad():
        logging.debug(f'Loaded testData with {len(loader.dataset)} testImages and {BATCH_SIZE} images per batch.')

        classes = getClasses(os.path.join(DATA_PATH, 'val'))
        grndT, pred = [], []
        for i, (images, labels) in enumerate(loader):
            logging.debug(f"Evaluating: {i}-th iteration")
            outputs = model(images)
            # index of the highest score along dimension 1
            _, predicted = torch.max(outputs, 1)
            # Iterate over the actual batch contents rather than
            # range(BATCH_SIZE): the final batch can be shorter, which
            # previously raised an IndexError.
            pred.extend(classes[idx] for idx in predicted.tolist())
            grndT.extend(classes[idx] for idx in labels.tolist())

            # debug runs stop early
            if IS_DEBUG and i == DEBUG_ITERATIONS:
                break
        return grndT, pred

In [4]:
import traceback

# Entry point of the notebook: configure logging and start the training run.
curTime = datetime.datetime.now()

log_level = logging.DEBUG

logging.basicConfig(level=log_level)
try:
    main(sys.argv)
except Exception as e:
    # Report the failure but keep the kernel usable; the CUDA cache is
    # released in the `finally` clause, so it is not emptied here as well.
    print("Oh no! Bad things happened...")
    print(e)
    traceback.print_exc()
finally:
    # runs on success, on errors and on interrupts alike
    torch.cuda.empty_cache()
#logging.info(f"Top1 Accuracy: {loadAndEvaluate()}")

INFO:root:Found 0 GPU(-s)
INFO:root:Loading model: resnet50
INFO:root:Training Arch:resnet50
/home/alex/Projects/Studium/anytimeDnn/data/imagenet_full/index-train.txt
/home/alex/Projects/Studium/anytimeDnn/data/imagenet_full/index-val.txt
DEBUG:root:Input:
752640 Elements times 4 bytes is 3010560
Target:
5 Elements times 8 bytes is 40
INFO:root:=> no checkpoint found at '/home/alex/Projects/Studium/anytimeDnn/checkpoints/resnet50_0_checkpoint.pth.tar'
DEBUG:root:Running train loop
No file found /home/alex/Projects/Studium/anytimeDnn/checkpoints/resnet50_0_checkpoint.pth.tar
INFO:root:Epoch 0 Train loop - Iteration 9/6379 - Loss 18.753629684448242
INFO:root:Acc@1   0.00 (  4.00) (  0.00) ( 20.00)
INFO:root:Acc@5   0.00 ( 14.00) (  0.00) ( 40.00)
INFO:root:Batch Time 14.378 (15.921) (13.981) (18.125)
INFO:root:Data Time  0.002 ( 0.006) ( 0.001) ( 0.042)


KeyboardInterrupt: 