In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from data.ImagenetDataset import get_imagenet_datasets
from msdnet.dataloader import get_dataloaders_alt
from resnet import ResNet
from densenet import *
#from msdnet.models.msdnet import MSDNet

import os
import shutil
import time

DATA_PATH = "data/imagenet_images"
BATCH_SIZE = 1


LEARNING_RATE = 0.1
MOMENTUM = 0.9
WEIGHT_DECAY = 1e-4
GPU_ID = None
START_EPOCH = 0
EPOCHS = 2
CHECKPOINT_INTERVALL = 10
CHECKPOINT_DIR = 'state'
ARCH = 'densenet'

if __name__ == "__main__":


    train_loader, test_loader, _ = get_dataloaders_alt(
        DATA_PATH, 
        data="ImageNet", 
        use_valid=False, 
        save='save/default-{}'.format(time.time()),
        batch_size=BATCH_SIZE, 
        workers=0, 
        splits=['train', 'test'])
    
    # create model 
    model = ResNet.resnet50()
    
    
    if not torch.cuda.is_available():
        print("Using CPU for slow training process")
    else:
        model = nn.DataParallel(model).cuda()
    
    # loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    
    optimizer = torch.optim.SGD(
        model.parameters(), 
        LEARNING_RATE, 
        momentum=MOMENTUM, 
        weight_decay=WEIGHT_DECAY)
    
    # TODO add loading of checkpoint behaviour

Using CPU for slow training process


In [24]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    file_path = os.path.join(CHECKPOINT_DIR, filename)
    
    if not os.path.isdir(CHECKPOINT_DIR):
        os.mkdir(CHECKPOINT_DIR)
    
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

In [40]:
def accuracy(output, target, topk=(1,)):
    """Computes accuracy over the k top predictions for the values of k"""
    
    # reduce memory consumption on following calculations
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].view(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        print(res)
        return res


In [4]:
def adjust_learning_rate(optimizer, epoch):
    """
        Sets learning rate to default value, decayed by division with 10 every 25 epochs and 
        updates the lr in the optimizer.
    """
    lr = LEARNING_RATE * (0.1 ** (epoch // 25)) 
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [26]:
def train(train_loader, model, criterion, optimizer, epoch):
    model.train()
    
    for i, (input, target) in enumerate(train_loader):
        print(f'Train loop {i}')
        if torch.cuda.is_available():
            target = target.cuda(GPU_ID, non_blocking=True)
            input = input.cuda()
            
        
        # compute output of the current network
        output = model(input)
        loss = criterion(output, target)
        print('computed output')
        
        print('optimizer step')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
        output = output.float()
        loss = loss.float()
        
        # printing statistics every 2000 mini batch size
        if i % 2 == 0:
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            
            print(f'Epoch {epoch} - Iteration {i}/{len(train_loader)} - Loss {loss} - Acc1 {acc1} - Acc5 {acc5}')
            return


In [32]:
def validate(val_loader, model, criterion):
    """Compute average accuracy, top 1 and top 5 accuracy"""
    model.eval()
    
    losses = AverageMeter()
    top1 = AverageMeter()
    ''
    with torch.no_grad():
        for i , (input, target) in enumerate(val_loader):
            print(f'validation loop {i}')
            # check if could be moved to cuda device
            if torch.cuda.is_available():
                target = target.cuda(GPU_ID, non_blocking=True)
                input = input.cuda()
                
            # compute output
            output = model(input)
            
            # compute loss
            loss = criterion(output, target)
            
            # measure accuracy and record loss
            prec1 = accuracy(output.data, target)[0]
            losses.update(loss.item(),input.size(0))
            print(prec1)
            top1.update(prec1.item(), input.size(0))
            
            if i == 15:
                print(f'Validation run\n - Loss {losses.val} - Loss Avg. {losses.avg}\n - Prec {top1.val} - Prec Avg. {top1.avg}')
                return top1.avg

In [22]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [39]:
# train loop
#for epoch in range(START_EPOCH, EPOCHS):
best_acc = 0.0
for epoch in range(0, 1):
    adjust_learning_rate(optimizer, epoch)
    
    # train for one epoch
    print('Running train loop')
    train(train_loader, model, criterion, optimizer, epoch)
    
    #evaluate the network on test set
    print('Compute accuracy')
    acc = validate(test_loader, model, criterion)
    
    # remember top acc
    is_best = acc > best_acc
    best_acc = max(acc, best_acc)
    
    # safe model
    if epoch % CHECKPOINT_INTERVALL == 0 or is_best:
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': ARCH,
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best)

Running train loop
Train loop 0
computed output
optimizer step
Accuracy
tensor(187)
[tensor(0.), tensor(0.)]
Epoch 0 - Iteration 0/123016 - Loss 10.617687225341797 - Acc1 0.0 - Acc5 0.0
Compute accuracy
validation loop 0
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 1
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 2
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 3
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 4
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 5
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 6
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 7
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 8
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 9
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 10
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 11
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
validation loop 12
Accuracy
tensor(1)
[tensor(0.)]
tensor(0.)
v